zoukankan      html  css  js  c++  java
  • 【搜索引擎Jediael开发笔记2】使用HttpClient下载网页至本地文件

    本文使用HttpClient根据url进行网页下载。其中

    (1)HttpClient的相关知识请参见 HttpClient基础教程

    (2)


    package org.ljh.search.downloadpage;
    
    import java.io.FileNotFoundException;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.PrintWriter;
    import java.io.Writer;
    import java.util.Scanner;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.HttpStatus;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    
    //本类用于将指定url对应的网页下载至本地一个文件。
    public class PageDownloader {
    
    	public static void downloadPageByGetMethod(String url) throws IOException {
    
    		// 1、通过HttpGet获取到response对象
    		CloseableHttpClient httpClient = HttpClients.createDefault();
    		// 注意,必需要加上http://的前缀,否则会报:Target host is null异常。
    		HttpGet httpGet = new HttpGet(url);
    		CloseableHttpResponse response = httpClient.execute(httpGet);
    
    		InputStream is = null;
    		if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
    			try {
    				// 2、获取response的entity。
    				HttpEntity entity = response.getEntity();
    
    				// 3、获取到InputStream对象,并对内容进行处理
    				is = entity.getContent();
    
    				String fileName = getFileName(url);
    				saveToFile("D:\tmp\", fileName, is);
    			} catch (ClientProtocolException e) {
    				e.printStackTrace();
    			} finally {
    
    				if (is != null) {
    					is.close();
    				}
    				if (response != null) {
    					response.close();
    				}
    			}
    		}
    	}
    
    	//将输入流中的内容输出到path指定的路径,fileName指定的文件名
    	private static void saveToFile(String path, String fileName, InputStream is) {
    		Scanner sc = new Scanner(is);
    		Writer os = null;
    		try {
    			os = new PrintWriter(path + fileName);
    			while (sc.hasNext()) {
    				os.write(sc.nextLine());
    			}
    		} catch (FileNotFoundException e) {
    			e.printStackTrace();
    		} catch (IOException e) {
    			e.printStackTrace();
    		} finally {
    			if (sc != null) {
    				sc.close();
    			}
    			if (os != null) {
    				try{
    				os.flush();
    				os.close();
    				}catch(IOException e){
    					e.printStackTrace();
    					System.out.println("输出流关闭失败!");
    				}
    			}
    		}
    	}
    
    	// 将url中的特殊字符用下划线代替
    	private static String getFileName(String url) {
    		url = url.substring(7);
    		String fileName = url.replaceAll("[\?:*|<>"/]", "_") + ".html";
    		return fileName;
    	}
    
    }
    



  • 相关阅读:
    Idea破解2019
    Navicat Premium 12破解激活
    Java高并发程序设计学习笔记(十一):Jetty分析
    Java高并发程序设计学习笔记(十):并发调试和JDK8新特性
    Java高并发程序设计学习笔记(九):锁的优化和注意事项
    Java高并发程序设计学习笔记(八):NIO和AIO
    Java高并发程序设计学习笔记(七):并行设计模式
    Java高并发程序设计学习笔记(六):JDK并发包(线程池的基本使用、ForkJoin)
    推荐一套WPF主题皮肤
    WPF中的动画——(五)关键帧动画
  • 原文地址:https://www.cnblogs.com/jediael/p/4304147.html
Copyright © 2011-2022 走看看