zoukankan      html  css  js  c++  java
  • 网络爬虫Java实现抓取网页内容

    package 抓取网页;

    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;

    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.HttpException;
    import org.apache.commons.httpclient.HttpStatus;
    import org.apache.commons.httpclient.NameValuePair;
    import org.apache.commons.httpclient.methods.PostMethod;

    /**
     * Minimal page-fetching example: sends a POST request carrying a fixed
     * name/password form to a URL and, when the server answers 200 OK, saves the
     * response body to a file in the current working directory.
     */
    public class RetrivePage {

        /** Client shared by every request issued from this class. */
        private static final HttpClient httpClient = new HttpClient();

        /** Size in bytes of the buffer used while copying the response body to disk. */
        private static final int BUFFER_SIZE = 8192;

        /** File name used when the URL has nothing after its last '/'. */
        private static final String DEFAULT_FILE_NAME = "index.html";

        /**
         * Fetches the lietu.com home page and saves it to disk. Failures are
         * reported by printing the stack trace.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            try {
                RetrivePage.downloadPage("http://www.lietu.com");
            } catch (HttpException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * POSTs the fixed form to {@code path} and, on a 200 response, writes the
         * response body to a local file named after the text following the last
         * '/' of {@code path}. The status code and the chosen file name are printed
         * to standard output. Nothing is written for any other status code or when
         * the response carries no body.
         *
         * @param path URL to request
         * @throws HttpException if the HTTP exchange fails at the protocol level
         * @throws IOException if the request, the response read, or the file write fails
         */
        private static void downloadPage(String path) throws HttpException, IOException {
            PostMethod postMethod = new PostMethod(path);

            // Form fields; for a POST they travel in the request body, not in the URL.
            NameValuePair[] postData = new NameValuePair[2];
            postData[0] = new NameValuePair("name","lietu");
            postData[1] = new NameValuePair("password","*****");
            postMethod.addParameters(postData);

            try {
                // Execute the request; the return value is the HTTP status code.
                int statusCode = httpClient.executeMethod(postMethod);
                System.out.println(statusCode);
                if (statusCode != HttpStatus.SC_OK) {
                    return;
                }

                InputStream input = postMethod.getResponseBodyAsStream();
                if (input == null) {
                    // The response has no body, so there is nothing to save.
                    return;
                }
                try {
                    String fileName = fileNameFor(path);
                    System.out.println(fileName);
                    OutputStream output = new FileOutputStream(fileName);
                    try {
                        byte[] buffer = new byte[BUFFER_SIZE];
                        int count;
                        // read(byte[]) signals end of stream with -1; a 0 byte in the
                        // payload is ordinary data and must not terminate the copy.
                        while ((count = input.read(buffer)) != -1) {
                            output.write(buffer, 0, count);
                        }
                    } finally {
                        output.close();
                    }
                } finally {
                    input.close();
                }
            } finally {
                // Always hand the connection back, including on failure paths.
                postMethod.releaseConnection();
            }
        }

        /**
         * Derives the local file name for a URL: the text after the last '/', or
         * {@link #DEFAULT_FILE_NAME} when that text is empty (URL ends with '/').
         */
        private static String fileNameFor(String path) {
            String fileName = path.substring(path.lastIndexOf('/') + 1);
            return fileName.length() == 0 ? DEFAULT_FILE_NAME : fileName;
        }
    }

  • 相关阅读:
    这些HTML、CSS知识点,面试和平时开发都需要 No1-No4(知识点:HTML、CSS、盒子模型、内容布局)
    skywalking在 .net Framework客户端使用
    WebSocket的扫码登录简单用法
    微信授权登录nginx代理
    本地下载文件的方法(兼容下载图片和视频)
    Vue-给对象新增属性(使用Vue.$set())
    浏览器解析URL的过程
    promise es6,es7
    filter全局方法的写法
    监听滚动条
  • 原文地址:https://www.cnblogs.com/airycode/p/5561015.html
Copyright © 2011-2022 走看看