zoukankan      html  css  js  c++  java
  • Java 爬虫

    import java.awt.BorderLayout;
    import java.awt.Cursor;
    import java.awt.Font;
    import java.awt.GridBagConstraints;
    import java.awt.GridBagLayout;
    import java.awt.Insets;
    import java.awt.event.ActionEvent;
    import java.awt.event.ActionListener;
    import java.awt.event.KeyEvent;
    import java.awt.event.WindowAdapter;
    import java.awt.event.WindowEvent;
    import java.io.BufferedReader;
    import java.io.FileWriter;
    import java.io.InputStreamReader;
    import java.io.PrintWriter;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.LinkedHashSet;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
     
    import javax.swing.BorderFactory;
    import javax.swing.JButton;
    import javax.swing.JCheckBox;
    import javax.swing.JComboBox;
    import javax.swing.JFrame;
    import javax.swing.JLabel;
    import javax.swing.JMenu;
    import javax.swing.JMenuBar;
    import javax.swing.JMenuItem;
    import javax.swing.JOptionPane;
    import javax.swing.JPanel;
    import javax.swing.JProgressBar;
    import javax.swing.JScrollPane;
    import javax.swing.JSeparator;
    import javax.swing.JTable;
    import javax.swing.JTextField;
    import javax.swing.table.DefaultTableModel;
     
     
    public class SearchCrawler extends JFrame {
    // Max URLs drop-down values.
    private static final String[] MAX_URLS = { "50", "100", "500", "1000" };

    // Cache of robot disallow lists, keyed by lower-cased host name.
    private HashMap disallowListCache = new HashMap();

    // Search GUI controls.
    private JTextField startTextField;

    private JComboBox maxComboBox;

    private JCheckBox limitCheckBox;

    private JTextField logTextField;

    private JTextField searchTextField;

    private JCheckBox caseCheckBox;

    private JButton searchButton;

    // Search stats GUI controls.
    private JLabel crawlingLabel2;

    private JLabel crawledLabel2;

    private JLabel toCrawlLabel2;

    private JProgressBar progressBar;

    private JLabel matchesLabel2;

    // Table listing search matches.
    private JTable table;

    // Flag for whether or not crawling is underway. The Search/Stop button
    // handler clears it while the crawl loop polls it from the worker thread,
    // so it is volatile to make a stop request visible to that loop.
    private volatile boolean crawling;

    // Matches log file print writer.
    private PrintWriter logFileWriter;
     
    // Constructor for Search Web Crawler.
    public SearchCrawler() {
    // Set application title.
    setTitle("Search Crawler");
     
    // Set window size.
    setSize(600, 600);
     
    // Handle window closing events.
    addWindowListener(new WindowAdapter() {
    public void windowClosing(WindowEvent e) {
    actionExit();
    }
    });
     
    // Set up File menu.
    JMenuBar menuBar = new JMenuBar();
    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
    public void actionPerformed(ActionEvent e) {
    actionExit();
    }
    });
    fileMenu.add(fileExitMenuItem);
    menuBar.add(fileMenu);
    setJMenuBar(menuBar);
     
    // Set up search panel.
    JPanel searchPanel = new JPanel();
    GridBagConstraints constraints;
    GridBagLayout layout = new GridBagLayout();
    searchPanel.setLayout(layout);
     
    JLabel startLabel = new JLabel("Start URL:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(startLabel, constraints);
    searchPanel.add(startLabel);
     
    startTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(startTextField, constraints);
    searchPanel.add(startTextField);
     
    JLabel maxLabel = new JLabel("Max URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxLabel, constraints);
    searchPanel.add(maxLabel);
     
    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(maxComboBox, constraints);
    searchPanel.add(maxComboBox);
     
    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.WEST;
    constraints.insets = new Insets(0, 10, 0, 0);
    layout.setConstraints(limitCheckBox, constraints);
    searchPanel.add(limitCheckBox);
     
    JLabel blankLabel = new JLabel();
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(blankLabel, constraints);
    searchPanel.add(blankLabel);
     
    JLabel logLabel = new JLabel("Matches Log File:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(logLabel, constraints);
    searchPanel.add(logLabel);
     
    String file = System.getProperty("user.dir")
    + System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(logTextField, constraints);
    searchPanel.add(logTextField);
     
    JLabel searchLabel = new JLabel("Search String:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(searchLabel, constraints);
    searchPanel.add(searchLabel);
     
    searchTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.insets = new Insets(5, 5, 0, 0);
    constraints.gridwidth = 2;
    constraints.weightx = 1.0d;
    layout.setConstraints(searchTextField, constraints);
    searchPanel.add(searchTextField);
     
    caseCheckBox = new JCheckBox("Case Sensitive");
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 5);
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    layout.setConstraints(caseCheckBox, constraints);
    searchPanel.add(caseCheckBox);
     
    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
    public void actionPerformed(ActionEvent e) {
    actionSearch();
    }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(searchButton, constraints);
    searchPanel.add(searchButton);
     
    JSeparator separator = new JSeparator();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    layout.setConstraints(separator, constraints);
    searchPanel.add(separator);
     
    JLabel crawlingLabel1 = new JLabel("Crawling:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawlingLabel1, constraints);
    searchPanel.add(crawlingLabel1);
     
    crawlingLabel2 = new JLabel();
    crawlingLabel2.setFont(crawlingLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawlingLabel2, constraints);
    searchPanel.add(crawlingLabel2);
     
    JLabel crawledLabel1 = new JLabel("Crawled URLs:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(crawledLabel1, constraints);
    searchPanel.add(crawledLabel1);
     
    crawledLabel2 = new JLabel();
    crawledLabel2.setFont(crawledLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(crawledLabel2, constraints);
    searchPanel.add(crawledLabel2);
     
    JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(toCrawlLabel1, constraints);
    searchPanel.add(toCrawlLabel1);
     
    toCrawlLabel2 = new JLabel();
    toCrawlLabel2.setFont(toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(toCrawlLabel2, constraints);
    searchPanel.add(toCrawlLabel2);
     
    JLabel progressLabel = new JLabel("Crawling Progress:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 0, 0);
    layout.setConstraints(progressLabel, constraints);
    searchPanel.add(progressLabel);
     
    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 0, 5);
    layout.setConstraints(progressBar, constraints);
    searchPanel.add(progressBar);
     
    JLabel matchesLabel1 = new JLabel("Search Matches:");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, 10, 0);
    layout.setConstraints(matchesLabel1, constraints);
    searchPanel.add(matchesLabel1);
    matchesLabel2 = new JLabel();
    matchesLabel2.setFont(matchesLabel2.getFont().deriveFont(Font.PLAIN));
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 10, 5);
    layout.setConstraints(matchesLabel2, constraints);
    searchPanel.add(matchesLabel2);
     
    // Set up matches table.
    table = new JTable(new DefaultTableModel(new Object[][] {},
    new String[] { "URL" }) {
    public boolean isCellEditable(int row, int column) {
    return false;
    }
    });
     
    // Set up Matches panel.
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
     
    // Add panels to display.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(searchPanel, BorderLayout.NORTH);
    getContentPane().add(matchesPanel, BorderLayout.CENTER);
    }
     
    /**
     * Exits the program.
     *
     * The matches log writer is closed first so that matches written during a
     * crawl that is still running are flushed to disk instead of being lost
     * when the JVM terminates.
     */
    private void actionExit() {
        PrintWriter openLog = logFileWriter;
        if (openLog != null) {
            // Closing an already closed PrintWriter is a no-op.
            openLog.close();
        }
        System.exit(0);
    }
     
    // Handle Search/Stop button being clicked.
    private void actionSearch() {
    // If stop button clicked, turn crawling flag off.
    if (crawling) {
    crawling = false;
    return;
    }
     
    ArrayList errorList = new ArrayList();
     
    // Validate that start URL has been entered.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
    errorList.add("Missing Start URL.");
    }
    // Verify start URL.
    else if (verifyUrl(startUrl) == null) {
    errorList.add("Invalid Start URL.");
    }
     
    // Validate that Max URLs is either empty or is a number.
    int maxUrls = 0;
    String max = ((String) maxComboBox.getSelectedItem()).trim();
    if (max.length() > 0) {
    try {
    maxUrls = Integer.parseInt(max);
    } catch (NumberFormatException e) {
    }
    if (maxUrls < 1) {
    errorList.add("Invalid Max URLs value.");
    }
    }
     
    // Validate that matches log file has been entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
    errorList.add("Missing Matches Log File.");
    }
     
    // Validate that search string has been entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
    errorList.add("Missing Search String.");
    }
     
    // Show errors, if any, and return.
    if (errorList.size() > 0) {
    StringBuffer message = new StringBuffer();
     
    // Concatenate errors into single message.
    for (int i = 0; i < errorList.size(); i++) {
    message.append(errorList.get(i));
    if (i + 1 < errorList.size()) {
    message.append("
    ");
    }
    }
     
    showError(message.toString());
    return;
    }
     
    // Remove "www" from start URL if present.
    startUrl = removeWwwFromUrl(startUrl);
     
    // Start the Search Crawler.
    search(logFile, startUrl, maxUrls, searchString);
    }
     
    /**
     * Runs a search on a new thread so the window stays responsive.
     *
     * The thread disables the form, switches the button to "Stop", resets the
     * matches table and statistics, opens the matches log, crawls, and finally
     * restores the form. The restore step runs in a finally block so that a
     * log file that cannot be opened, or an exception thrown while crawling,
     * does not leave the window disabled with the wait cursor showing.
     *
     * @param logFile      path of the matches log file to (re)create
     * @param startUrl     URL the crawl starts from
     * @param maxUrls      maximum number of URLs to crawl, or -1 for no limit
     * @param searchString whitespace-separated terms to look for
     */
    private void search(final String logFile, final String startUrl,
            final int maxUrls, final String searchString) {
        Thread thread = new Thread(new Runnable() {
            public void run() {
                // Show hour glass cursor while crawling is under way.
                setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

                // Disable search controls and switch Search button to "Stop."
                setSearchControlsEnabled(false);
                searchButton.setText("Stop");

                // Reset stats.
                table.setModel(new DefaultTableModel(new Object[][] {},
                        new String[] { "URL" }) {
                    public boolean isCellEditable(int row, int column) {
                        return false;
                    }
                });
                updateStats(startUrl, 0, 0, maxUrls);

                try {
                    // Open matches log file.
                    try {
                        logFileWriter = new PrintWriter(new FileWriter(logFile));
                    } catch (Exception e) {
                        showError("Unable to open matches log file.");
                        return;
                    }

                    // Crawl with the flag raised; always lower it and close
                    // the log afterwards, even if crawling fails.
                    crawling = true;
                    try {
                        crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                                searchString, caseCheckBox.isSelected());
                    } finally {
                        crawling = false;
                        try {
                            logFileWriter.close();
                        } catch (Exception e) {
                            showError("Unable to close matches log file.");
                        }
                    }

                    // Mark search as done.
                    crawlingLabel2.setText("Done");
                } finally {
                    // Enable search controls, switch the button back to
                    // "Search" and return to the default cursor.
                    setSearchControlsEnabled(true);
                    searchButton.setText("Search");
                    setCursor(Cursor.getDefaultCursor());
                }

                // Show message if search string not found. Only reached when
                // the crawl itself ran to completion.
                if (table.getRowCount() == 0) {
                    JOptionPane.showMessageDialog(
                            SearchCrawler.this,
                            "Your Search String was not found. Please try another.",
                            "Search String Not Found",
                            JOptionPane.WARNING_MESSAGE);
                }
            }
        });
        thread.start();
    }

    // Enables or disables every input control of the search form at once.
    private void setSearchControlsEnabled(boolean enabled) {
        startTextField.setEnabled(enabled);
        maxComboBox.setEnabled(enabled);
        limitCheckBox.setEnabled(enabled);
        logTextField.setEnabled(enabled);
        searchTextField.setEnabled(enabled);
        caseCheckBox.setEnabled(enabled);
    }
     
    /**
     * Pops up a modal error dialog, parented to this window, that shows the
     * given message under the title "Error".
     */
    private void showError(String message) {
        final String dialogTitle = "Error";
        final int dialogKind = JOptionPane.ERROR_MESSAGE;
        JOptionPane.showMessageDialog(this, message, dialogTitle, dialogKind);
    }
     
    /**
     * Refreshes the statistics read-outs.
     *
     * @param crawling text for the "Crawling:" label (the URL being processed)
     * @param crawled  number of URLs crawled so far; also the progress value
     * @param toCrawl  number of URLs still queued
     * @param maxUrls  crawl limit used as the progress maximum; -1 means no
     *                 limit, in which case the maximum is crawled + toCrawl
     */
    private void updateStats(String crawling, int crawled, int toCrawl,
            int maxUrls) {
        crawlingLabel2.setText(crawling);
        crawledLabel2.setText(String.valueOf(crawled));
        toCrawlLabel2.setText(String.valueOf(toCrawl));

        // Progress bar: scale against the limit if there is one.
        int progressMaximum = (maxUrls == -1) ? (crawled + toCrawl) : maxUrls;
        progressBar.setMaximum(progressMaximum);
        progressBar.setValue(crawled);

        // The match count is whatever the matches table currently holds.
        int matchCount = table.getRowCount();
        matchesLabel2.setText(String.valueOf(matchCount));
    }
     
    /**
     * Records a matching URL: appends it as a new row of the matches table
     * and writes it as a line to the matches log file.
     */
    private void addMatch(String url) {
        // Table first, so the match is visible even if logging fails.
        Object[] row = { url };
        DefaultTableModel matches = (DefaultTableModel) table.getModel();
        matches.addRow(row);

        // Then the log file.
        try {
            logFileWriter.println(url);
        } catch (Exception e) {
            showError("Unable to log match.");
        }
    }
     
    /**
     * Checks a URL string and converts it to a URL object.
     *
     * @param url the string to check
     * @return the parsed URL, or null when the string does not start with
     *         "http://" (compared case-insensitively) or cannot be parsed
     */
    private URL verifyUrl(String url) {
        // Only HTTP URLs are accepted.
        boolean isHttp = url.toLowerCase().startsWith("http://");
        if (isHttp) {
            try {
                return new URL(url);
            } catch (Exception e) {
                // Malformed URL: fall through and report it as invalid.
            }
        }
        return null;
    }
     
    /**
     * Checks whether the host's robots.txt permits crawling the given URL.
     *
     * The disallow list of each host is downloaded once and cached. Every
     * "Disallow:" line is honoured regardless of the User-agent section it
     * appears in. An empty "Disallow:" value means "nothing is disallowed"
     * and is therefore skipped; storing it would make every path match.
     *
     * @param urlToCheck URL about to be crawled
     * @return false if the URL's file part starts with a disallowed path;
     *         true otherwise, including when robots.txt cannot be read
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();

        // Retrieve host's disallow list from cache.
        ArrayList disallowList = (ArrayList) disallowListCache.get(host);

        // If list is not in the cache, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList();

            final String disallowField = "Disallow:";
            BufferedReader reader = null;
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");

                // Open connection to robot file URL for reading.
                reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                // Read robot file, creating list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    // Field names in robots.txt are case-insensitive.
                    if (!line.regionMatches(true, 0, disallowField, 0,
                            disallowField.length())) {
                        continue;
                    }

                    String disallowPath = line.substring(disallowField.length());

                    // Strip a trailing comment, if present.
                    int commentIndex = disallowPath.indexOf('#');
                    if (commentIndex != -1) {
                        disallowPath = disallowPath.substring(0, commentIndex);
                    }

                    // Remove leading or trailing spaces from disallow path.
                    disallowPath = disallowPath.trim();

                    // An empty value disallows nothing, so do not record it.
                    if (disallowPath.length() > 0) {
                        disallowList.add(disallowPath);
                    }
                }

                // Add new disallow list to cache.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                /*
                 * Assume robot is allowed since an exception is thrown if the
                 * robot file doesn't exist.
                 */
                return true;
            } finally {
                // Release the connection whether or not reading succeeded.
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (Exception ignored) {
                        // Nothing useful can be done if closing fails.
                    }
                }
            }
        }

        /*
         * Loop through disallow list to see if crawling is allowed for the
         * given URL.
         */
        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = (String) disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }

        return true;
    }
     
    /**
     * Downloads the page at the given URL.
     *
     * Lines are joined with a newline so that words and tags separated only
     * by a line break in the source page are not fused together. The reader
     * is always closed.
     *
     * @param pageUrl URL of the page to fetch
     * @return the page text, or null if the page could not be read
     */
    private String downloadPage(URL pageUrl) {
        BufferedReader reader = null;
        try {
            // Open connection to URL for reading.
            reader = new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));

            // Read page into buffer, keeping the line boundaries.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line).append('\n');
            }

            return pageBuffer.toString();
        } catch (Exception e) {
            // Best effort: an unreadable page is reported as null.
            return null;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
     
    /**
     * Strips a leading "www." from the host part of a URL.
     *
     * @param url URL string to normalise
     * @return the URL without the first "www." that directly follows "://",
     *         or the URL unchanged when "://www." does not occur in it
     */
    private String removeWwwFromUrl(String url) {
        int markerAt = url.indexOf("://www.");
        if (markerAt == -1) {
            return url;
        }

        // Keep everything up to and including "://", then skip over "www.".
        String schemePart = url.substring(0, markerAt + 3);
        String afterWww = url.substring(markerAt + 7);
        return schemePart + afterWww;
    }
     
    // Parse through page contents and retrieve links.
    private ArrayList retrieveLinks(URL pageUrl, String pageContents,
    HashSet crawledList, boolean limitHost) {
    // Compile link matching pattern.
    Pattern p = Pattern.compile("<a\s+href\s*=\s*"?(.*?)[" |>]",
    Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(pageContents);
     
    // Create list of link matches.
    ArrayList linkList = new ArrayList();
    while (m.find()) {
    String link = m.group(1).trim();
     
    // Skip empty links.
    if (link.length() < 1) {
    continue;
    }
     
    // Skip links that are just page anchors.
    if (link.charAt(0) == '#') {
    continue;
    }
     
    // Skip mailto links.
    if (link.indexOf("mailto:") != -1) {
    continue;
    }
     
    // Skip JavaScript links.
    if (link.toLowerCase().indexOf("javascript") != -1) {
    continue;
    }
     
    // Prefix absolute and relative URLs if necessary.
    if (link.indexOf("://") == -1) {
    // Handle absolute URLs.
    if (link.charAt(0) == '/') {
    link = "http://" + pageUrl.getHost() + link;
    // Handle relative URLs.
    } else {
    String file = pageUrl.getFile();
    if (file.indexOf('/') == -1) {
    link = "http://" + pageUrl.getHost() + "/" + link;
    } else {
    String path = file.substring(0,
    file.lastIndexOf('/') + 1);
    link = "http://" + pageUrl.getHost() + path + link;
    }
    }
    }
     
    // Remove anchors from link.
    int index = link.indexOf('#');
    if (index != -1) {
    link = link.substring(0, index);
    }
     
    // Remove leading "www" from URL's host if present.
    link = removeWwwFromUrl(link);
     
    // Verify link and skip if invalid.
    URL verifiedLink = verifyUrl(link);
    if (verifiedLink == null) {
    continue;
    }
     
    /*
    * If specified, limit links to those having the same host as the
    * start URL.
    */
    if (limitHost
    && !pageUrl.getHost().toLowerCase().equals(
    verifiedLink.getHost().toLowerCase())) {
    continue;
    }
     
    // Skip link if it has already been crawled.
    if (crawledList.contains(link)) {
    continue;
    }
     
    // Add link to list.
    linkList.add(link);
    }
     
    return (linkList);
    }
     
    // Splits a search string into terms on runs of whitespace.
    private static final Pattern TERM_SEPARATOR = Pattern.compile("[\\s]+");

    /**
     * Determines whether every term of the search string occurs in the given
     * page contents.
     *
     * @param pageContents  text of the page
     * @param searchString  whitespace-separated search terms
     * @param caseSensitive whether terms must match with their exact case
     * @return true if each term is found somewhere in the page
     */
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        // For a case-insensitive search, compare lower-cased page contents
        // against lower-cased terms.
        String searchContents = caseSensitive
                ? pageContents
                : pageContents.toLowerCase();

        // Split search string into individual terms.
        String[] terms = TERM_SEPARATOR.split(searchString);

        // Every term must be present.
        for (int i = 0; i < terms.length; i++) {
            String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
            if (searchContents.indexOf(term) == -1) {
                return false;
            }
        }

        return true;
    }
     
    /**
     * Performs the actual crawling, searching each page for the search string.
     *
     * Starting from startUrl, URLs are taken from the front of the to-crawl
     * queue in insertion order. Each page allowed by robots.txt is downloaded,
     * its links are queued, and the page is recorded as a match when it
     * contains every search term. The loop ends when the queue is empty, the
     * crawling flag is cleared, or maxUrls pages have been crawled.
     *
     * @param startUrl      URL to start from
     * @param maxUrls       maximum number of pages to crawl, or -1 for no limit
     * @param limitHost     whether to follow only links on the same host
     * @param searchString  whitespace-separated terms to look for
     * @param caseSensitive whether terms must match with their exact case
     */
    public void crawl(String startUrl, int maxUrls, boolean limitHost,
            String searchString, boolean caseSensitive) {
        // Set up crawl lists.
        HashSet crawledList = new HashSet();
        LinkedHashSet toCrawlList = new LinkedHashSet();

        // Add start URL to the to crawl list.
        toCrawlList.add(startUrl);

        /*
         * Perform actual crawling by looping through the To Crawl list.
         */
        while (crawling && toCrawlList.size() > 0) {
            /*
             * Check to see if the max URL count has been reached, if it was
             * specified.
             */
            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }

            // Take the oldest queued URL (front of the insertion-ordered set).
            String url = (String) toCrawlList.iterator().next();
            System.out.println(url);
            // Remove URL from the To Crawl list.
            toCrawlList.remove(url);

            // Convert string url to URL object.
            URL verifiedUrl = verifyUrl(url);

            // Skip anything that is not a valid HTTP URL rather than failing
            // with a NullPointerException in the robots check below.
            if (verifiedUrl == null) {
                continue;
            }

            // Skip URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Update crawling stats.
            updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

            // Add page to the crawled list.
            crawledList.add(url);

            // Download the page at the given URL.
            String pageContents = downloadPage(verifiedUrl);

            /*
             * If the page was downloaded successfully, retrieve all its links
             * and then see if it contains the search string.
             */
            if (pageContents != null && pageContents.length() > 0) {
                // Retrieve list of valid links from page.
                ArrayList links = retrieveLinks(verifiedUrl, pageContents,
                        crawledList, limitHost);

                // Add links to the To Crawl list.
                toCrawlList.addAll(links);

                /*
                 * Check if search string is present in page, and if so,
                 * record a match.
                 */
                if (searchStringMatches(pageContents, searchString,
                        caseSensitive)) {
                    addMatch(url);
                }
            }

            // Update crawling stats.
            updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
        }
    }
     
    /**
     * Runs the Search Crawler: creates the window and makes it visible.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler();
        // setVisible(true) replaces the deprecated show().
        crawler.setVisible(true);
    }
    }
     
    
       
  • 相关阅读:
    jupyter中使用熟悉的vim
    解决安装manjaro中安装ccs10.2的库缺失问题
    Markdown中公式
    诗就应该边读边品的,不要
    为neovim添加python3支持
    bilibili视频保存目录
    新工科教育--之我所见
    父母的爱 和汽车的后背箱
    解决manjaro中jupyter无法修改目录和默认浏览器的问题:
    解决jupyter的能打开python文件无法新建的问题
  • 原文地址:https://www.cnblogs.com/tested/p/3880400.html
Copyright © 2011-2022 走看看