zoukankan      html  css  js  c++  java
  • java基础:9.4 web爬虫

    通过跟随页面中的超链接来自动遍历 Web。

    package day11;
    import java.util.ArrayList;
    import java.util.Scanner;
    /**
     * A minimal web crawler: starting from one URL, it downloads each page,
     * pulls out every quoted link that begins with {@code http:}, and visits
     * those links in first-in, first-out order.
     */
    public class WebCrawler {
        /** Text that marks the start of a link inside a line of page source. */
        private static final String LINK_PREFIX = "http:";

        /** The crawl loop keeps running while at most this many URLs have been traversed. */
        private static final int TRAVERSAL_LIMIT = 100;

        /**
         * Prompts for a starting URL on standard input and crawls from it.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            Scanner input = new Scanner(System.in);
            System.out.println("enter a URL(such as:http://wwww.xxxx.com):");
            String url = input.nextLine();
            crawler(url);
        }

        /**
         * Crawls outward from {@code startingURL}, printing one line per visited URL.
         *
         * <p>The loop ends when no URLs are pending or once more than
         * {@code TRAVERSAL_LIMIT} URLs have been traversed; because the bound is
         * checked with {@code <=}, up to {@code TRAVERSAL_LIMIT + 1} URLs may be
         * visited.
         *
         * @param startingURL the first URL to visit
         */
        public static void crawler(String startingURL) {
            ArrayList<String> listOfPendingURLs = new ArrayList<>();
            ArrayList<String> listOfTraversedURLs = new ArrayList<>();
            listOfPendingURLs.add(startingURL);
            int crawled = 0;
            while (!listOfPendingURLs.isEmpty()
                    && listOfTraversedURLs.size() <= TRAVERSAL_LIMIT) {
                // Take the oldest pending URL so pages are visited in discovery order.
                String urlString = listOfPendingURLs.remove(0);
                if (listOfTraversedURLs.contains(urlString)) {
                    // The same link can be queued several times before it is visited.
                    continue;
                }
                listOfTraversedURLs.add(urlString);
                System.out.println("Crawl " + ++crawled + "  " + urlString);

                for (String s : getSubURLs(urlString)) {
                    if (!listOfTraversedURLs.contains(s)) {
                        listOfPendingURLs.add(s);
                    }
                }
            }
        }

        /**
         * Downloads the resource at {@code urlString} and returns the links found in it.
         *
         * <p>A link is any text that starts with {@code http:} and runs up to (but not
         * including) the next double quote on the same line. Failures (malformed URL,
         * unreachable host, ...) are reported on standard output and whatever was
         * collected so far is returned, so one bad page does not stop a crawl.
         *
         * @param urlString the URL of the page to scan
         * @return the links found, in order of appearance; empty if none were found
         *         or the page could not be read
         */
        public static ArrayList<String> getSubURLs(String urlString) {
            ArrayList<String> list = new ArrayList<>();

            // try-with-resources closes the Scanner, and with it the underlying stream.
            try (Scanner input = new Scanner(new java.net.URL(urlString).openStream())) {
                while (input.hasNextLine()) {
                    collectLinks(input.nextLine(), list);
                }
            }
            catch (Exception ex) {
                // Deliberately broad: crawling is best-effort, so report and move on.
                System.out.println("error:" + ex.getMessage());
            }

            return list;
        }

        /** Appends to {@code links} every quote-terminated {@code http:} link in {@code line}. */
        private static void collectLinks(String line, ArrayList<String> links) {
            // Restart at column 0 for every line; index 0 is a valid match position.
            int current = line.indexOf(LINK_PREFIX);
            while (current >= 0) {
                int endIndex = line.indexOf("\"", current);
                if (endIndex < 0) {
                    // No closing quote on this line: nothing more to extract here.
                    return;
                }
                links.add(line.substring(current, endIndex));
                current = line.indexOf(LINK_PREFIX, endIndex);
            }
        }
    }
    
  • 相关阅读:
    第七周总结
    第六周总结
    第五周总结
    第四周总结
    第三周总结
    第二周总结
    第一周总结
    《需求分析和系统设计》阅读笔记三
    《需求分析和系统设计》阅读笔记二
    Linux——error while loading shared libraries 的解决方法
  • 原文地址:https://www.cnblogs.com/l20902/p/10610933.html
Copyright © 2011-2022 走看看