zoukankan      html  css  js  c++  java
  • jsoup httpclient 爬取网页并下载google图标

    jsoup下载地址 http://www.jsoup.org

    httpclient下载地址 http://hc.apache.org/downloads.cgi

    其他jar包(commons-io、commons-lang、org.json、google-api-translate-java)见原文附件

    Crawler

    package jsoup;  
      
    import java.io.File;  
    import java.io.FileOutputStream;  
    import java.io.IOException;  
    import java.io.InputStream;  
    import java.util.HashMap;  
    import java.util.Map;  
      
    import org.apache.commons.io.FileUtils;  
    import org.apache.commons.io.IOUtils;  
    import org.apache.http.HttpEntity;  
    import org.apache.http.HttpResponse;  
    import org.apache.http.HttpStatus;  
    import org.apache.http.client.methods.HttpGet;  
    import org.apache.http.impl.client.DefaultHttpClient;  
    import org.apache.http.params.HttpProtocolParams;  
    import org.apache.http.util.EntityUtils;  
      
    import com.google.api.translate.Language;  
    import com.google.api.translate.Translate;  
      
    /**
     * Base class for simple crawlers: provides HTTP GET helpers built on Apache
     * HttpClient plus a thin wrapper around the Google translate API.
     */
    public abstract class Crawler {

        /** User-Agent header sent with every request issued by this class. */
        private static final String USER_AGENT =
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9";

        /**
         * Translates English text to Chinese using the Google translate API.
         *
         * @param en
         *            the English text to translate
         * @return the translated text, or an empty string if the translation
         *         call throws
         */
        public String translateEnToCinese(String en) {
            Translate.setHttpReferrer("http://www.xxx.com");
            try {
                return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return "";
        }

        /**
         * Creates a new, empty, mutable map.
         *
         * @return a fresh {@code HashMap} with an initial capacity of 0
         */
        public Map<String, Object> getMap() {
            return new HashMap<String, Object>(0);
        }

        /**
         * Downloads the resource at {@code url} and writes it to the file
         * {@code dir}.
         * <p>
         * The download is best-effort: a non-200 response, a response without a
         * body, or any error while executing the request or copying the data is
         * reported on the console and otherwise ignored, so that a caller
         * looping over many files keeps going. Nothing is written to disk unless
         * the server answered 200 with a body.
         *
         * @param url
         *            HTTP address of the file
         * @param dir
         *            path of the target file (not a directory, despite the name)
         * @throws Exception
         *             if {@code url} is not a syntactically valid URI
         */
        public void downloadFile(String url, String dir) throws Exception {
            HttpGet httpGet = new HttpGet();
            httpGet.setURI(new java.net.URI(url));
            DefaultHttpClient httpClient = newHttpClient();

            InputStream input = null;
            FileOutputStream output = null;
            try {
                HttpResponse response = httpClient.execute(httpGet);
                int status = response.getStatusLine().getStatusCode();
                HttpEntity entity = response.getEntity();
                if (status != HttpStatus.SC_OK || entity == null) {
                    // Do not save an error page (or nothing at all) as if it were the file.
                    System.out.println("Download of [" + url + "] skipped, HTTP status " + status);
                    return;
                }
                input = entity.getContent();
                output = FileUtils.openOutputStream(new File(dir));
                IOUtils.copy(input, output);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                IOUtils.closeQuietly(output);
                IOUtils.closeQuietly(input);
                // Each call creates its own client, so its connections must be
                // released here or they leak.
                httpClient.getConnectionManager().shutdown();
            }
        }

        /**
         * Performs a GET request and returns the whole response body as text.
         *
         * @param url
         *            address to request
         * @param params
         *            optional; when present, {@code params[0]} is the charset
         *            name passed to {@code EntityUtils.toString} (defaults to
         *            "UTF-8")
         * @return the response body, or an empty string when the status is not
         *         200, the response has no body, or the request fails
         * @throws Exception
         *             if {@code url} is not a syntactically valid URI
         */
        public synchronized String doGet(String url, String... params)
                throws Exception {
            String charset = "UTF-8";
            if (null != params && params.length >= 1) {
                charset = params[0];
            }
            HttpGet httpGet = new HttpGet();
            httpGet.setURI(new java.net.URI(url));
            DefaultHttpClient httpClient = newHttpClient();
            String content = "";
            try {
                HttpResponse response = httpClient.execute(httpGet);
                int status = response.getStatusLine().getStatusCode();
                if (status == HttpStatus.SC_OK) {
                    HttpEntity entity = response.getEntity();
                    if (entity != null) {
                        // Pass the charset explicitly; without it EntityUtils
                        // falls back to ISO-8859-1.
                        content = EntityUtils.toString(entity, charset);
                    }
                }
            } catch (Exception e) {
                System.out.println("访问【" + url + "】出现异常!");
                e.printStackTrace();
            } finally {
                // Release the request and the client's connections.
                httpGet.abort();
                httpClient.getConnectionManager().shutdown();
            }
            return content;
        }

        /** Creates an HTTP client configured with this class's User-Agent. */
        private static DefaultHttpClient newHttpClient() {
            DefaultHttpClient httpClient = new DefaultHttpClient();
            HttpProtocolParams.setUserAgent(httpClient.getParams(), USER_AGENT);
            return httpClient;
        }
    }

    GoogleLogoCrawler

    package jsoup;  
      
    import java.io.File;  
    import java.io.IOException;  
    import java.util.ArrayList;  
    import java.util.Date;  
    import java.util.List;  
    import java.util.Map;  
      
    import org.apache.commons.io.FileUtils;  
    import org.apache.commons.lang.StringUtils;  
    import org.json.JSONArray;  
    import org.json.JSONObject;  
    import org.jsoup.Jsoup;  
    import org.jsoup.nodes.Document;  
    import org.jsoup.nodes.Element;  
    import org.jsoup.select.Elements;  
      
    /** 
     * google logo 下载程序 
     */  
    public class GoogleLogoCrawler extends Crawler {  
          
        private static final String URL = "http://www.logocollect.com/google/year.php?key=%y&page=%p";   
      
        private static final String LOGO_URL = "http://www.logocollect.com/google/";  
      
        private static final String[] YEARS = new String[] {   
                //"1998", "1999", "2000",  
                //"2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008",   
                "2009", "2010", "2011", "2012" };  
      
        private static final String INDEX = "http://www.logocollect.com/google/year.php?key=%y";   
      
        private static final String DIR_PATH = "D:\googlelogos\";  
      
        public void doStart() {  
            JSONArray array = new JSONArray();  
            for (String year : YEARS) {  
                String ind = INDEX.replaceAll("%y", year);  
                int pageCount = getPageCount(ind);  
                for (int i = 1; i < pageCount+1; i++) {  
                    String url = URL.replaceAll("%y", year).replaceAll("%p", i + "");  
                    String path = year + "_" + i;  
                    start(url, array, DIR_PATH + path + "\", path);  
                }  
            }  
            try {  
                FileUtils.writeStringToFile(new File(DIR_PATH + "json"), array.toString(), "UTF-8");  
            } catch (IOException e) {  
                e.printStackTrace();  
            }  
            System.out.println(array);  
        }  
          
        public int getPageCount(String url) {  
            int pageCount = 1;  
            try {  
                org.jsoup.nodes.Document doc = Jsoup.connect(url).get();  
                  
                String els = doc.html().toString();  
                int start = els.indexOf("总页数") + 4;  
                String temp = els.substring(start);  
                int end = temp.indexOf(",");  
                pageCount = Integer.parseInt(els.substring(start,start+end));  
                System.out.println(pageCount);  
            } catch (IOException e) {  
                e.printStackTrace();  
            }  
            return pageCount;  
        }  
      
        public void start(String url, JSONArray array, String dir, String path) {  
            try {  
                String content = super.doGet(url);  
                Document doc = Jsoup.parse(content);  
                Elements dds = doc.select(".img img");  
                List<Map<String, Object>> list = new ArrayList<Map<String, Object>>(0);  
                for (int i = 0; i < dds.size(); i++) {  
                    Element img = dds.get(i);  
                    String src = img.select("img").first().attr("src");  
                    String title = img.select("img").first().attr("title");  
                    Map<String, Object> map = super.getMap();  
                      
                    map.put("url", LOGO_URL + src);  
                    map.put("title", title);  
                      
                    list.add(map);  
                }  
                JSONArray tempJsonArray = new JSONArray();  
                for (Map<String, Object> map : list) {  
                    JSONObject jsonObject = new JSONObject();  
                    String proxy = StringUtils.substringAfterLast(map.get("url")  
                            .toString(), ".");  
                    long date = new Date().getTime();  
                    String name = date + "." + proxy;  
                    jsonObject.put("url", map.get("url").toString());  
                    jsonObject.put("dir", name);  
                    jsonObject.put("title", map.get("title").toString());  
                      
                    // 翻译  
    //              String dateZh = super.translateEnToCinese(map.get("date")  
    //                      .toString());  
    //              String titleZh = super.translateEnToCinese(map.get("title")  
    //                      .toString());  
    //              json.put("title_zh_cn", dateZh + " - " + titleZh);  
                      
                    // 下载图片  
                    super.downloadFile(map.get("url").toString(), dir + name);  
                    tempJsonArray.put(jsonObject);  
                }  
                array.put(new JSONObject().put(path, tempJsonArray));  
            } catch (Exception e) {  
                e.printStackTrace();  
            }  
        }  
      
        public static void main(String[] args) throws Exception {  
            new GoogleLogoCrawler().doStart();  
        }  
      
    }  

    本文转自:http://you-java.iteye.com/blog/1460271

  • 相关阅读:
    python-变量
    Python-编码
    Linux中 set、env、declare、export显示shell变量的区别
    iOS 为移动中的UIView(UIButton )添加点击事件
    iOS 8 TabBar 图片显示真实颜色
    Error Domain=ASIHTTPRequestErrorDomain Code=8 "Failed to move file from"xxx/xxx"to"xxx/xxx"
    iOS 判断View 是否是第一次显示
    编写程序时的注意事项
    iOS 图片填充 UIImageView (contentMode)
    修改 UISearchBar cancelButton 样式
  • 原文地址:https://www.cnblogs.com/dreammyle/p/4150003.html
Copyright © 2011-2022 走看看