zoukankan      html  css  js  c++  java
  • 使用Java调用谷歌搜索

           ----------------------疯狂软件java培训分享---------------------

      自己没搜索引擎,又想要大规模的数据源,怎么办?可以对百度搜索和谷歌搜索善加利用,以小搏大,站在巨人的肩膀上。有很多的应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现,比如网站的新闻采集,比如技术、品牌的新闻跟踪,比如知识库的收集,比如人机问答系统等,我之前做的一个准确率达百分之九十几的人机问答系统的数据源,其中一部分就是充分利用了百度搜索和谷歌搜索。在此演示的技术的基础上,可以很容易地扩展到其他的搜索引擎;也可以借鉴文中使用的NekoHTML+XPath技术,轻松获取页面中自定义的内容。

      Java代码

      package org.apdplat.demo.search;

      import java.io.IOException;

      import java.io.InputStream;

      import java.io.UnsupportedEncodingException;

      import java.net.URLEncoder;

      import java.util.ArrayList;

      import java.util.List;

      import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;

      import org.apache.commons.httpclient.HttpClient;

      import org.apache.commons.httpclient.HttpStatus;

      import org.apache.commons.httpclient.methods.GetMethod;

      import org.apache.commons.httpclient.params.HttpMethodParams;

      import org.json.JSONArray;

      import org.json.JSONException;

      import org.json.JSONObject;

      import org.slf4j.Logger;

      import org.slf4j.LoggerFactory;

      public class GoogleSearcher {

      private static final Logger LOG = LoggerFactory.getLogger

      (GoogleSearcher.class);

      public static List searchGoogle(String url) {

      List webpages = new ArrayList<>();

      try {

      HttpClient httpClient = new HttpClient();

      GetMethod getMethod = new GetMethod(url);

      httpClient.executeMethod(getMethod);

      getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,

      new DefaultHttpMethodRetryHandler());

      int statusCode = httpClient.executeMethod(getMethod);

      if (statusCode != HttpStatus.SC_OK) {

      LOG.error("搜索失败: " + getMethod.getStatusLine());

      return null;

      }

      InputStream in = getMethod.getResponseBodyAsStream();

      byte[] responseBody = Tools.readAll(in);

      String response = new String(responseBody, "UTF-8");

      LOG.debug("搜索返回数据:" + response);

      JSONObject json = new JSONObject(response);

      String totalResult = json.getJSONObject

      ("responseData").getJSONObject("cursor").getString("estimatedResultCount");

      int totalResultCount = Integer.parseInt(totalResult);

      LOG.info("搜索返回记录数: " + totalResultCount);

      JSONArray results = json.getJSONObject

      ("responseData").getJSONArray("results");

      LOG.debug("搜索结果:");

      for (int i = 0; i < results.length(); i++) {

      Webpage webpage = new Webpage();

      JSONObject result = results.getJSONObject(i);

      //提取标题

      String title = result.getString("titleNoFormatting");

      LOG.debug("标题:" + title);

      webpage.setTitle(title);

      //提取摘要

      String summary = result.get("content").toString();

      summary = summary.replaceAll("", "");

      summary = summary.replaceAll("", "");

      summary = summary.replaceAll("...", "");

      LOG.debug("摘要:" + summary);

      webpage.setSummary(summary);

      //从URL中提取正文

      String _url = result.get("url").toString();

      webpage.setUrl(_url);

      String content = Tools.getHTMLContent(_url);

      LOG.debug("正文:" + content);

      webpage.setContent(content);

      webpages.add(webpage);

      }

      } catch (IOException | JSONException | NumberFormatException e) {

      LOG.error("执行搜索失败:", e);

      }

      return webpages;

      }

      public static void main(String args[]) {

      String query = "杨尚川";

      try {

      query = URLEncoder.encode(query, "UTF-8");

      } catch (UnsupportedEncodingException e) {

      LOG.error("url构造失败", e);

      return;

      }

      String url = "http://ajax.googleapis.com/ajax/services/search/web?

      start=0&rsz=large&v=1.0&q=" + query;

      List webpages = searchGoogle(url);

      if (webpages != null) {

      int i = 1;

      for (Webpage webpage : webpages) {

      LOG.info("搜索结果 " + (i++) + " :");

      LOG.info("标题:" + webpage.getTitle());

      LOG.info("URL:" + webpage.getUrl());

      LOG.info("摘要:" + webpage.getSummary());

      LOG.info("正文:" + webpage.getContent());

      LOG.info("");

      }

      } else {

      LOG.error("没有搜索到结果");

      }

      }

      }

      疯狂软件java培训、ios培训新年钜惠,报名Java就业班免费赠送java基础班,报名iOS就业班免费赠送iOS基础班,本月火速抢座中,为回报广大新老学员,值此新年之际推出报读就业班赠送基础班的活动

      -----------------------------------2014年初活动--------------------------------------

      疯狂软件Java学习班方向:

      1.报读JavaEE就业班赠送基础班课程。

      2.开班一次性(一个月内)交清JavaEE就业班学费,赠送基础班全套课程。

      疯狂软件iOS学习班方向:

      1.报读iOS应用+手游就业班赠送iOS基础班课程。

      2.开班一次性(一个月内)交清iOS应用+手游就业班学费,赠送基础班全套课程。

  • 相关阅读:
    Python采用struct处理二进制
    OVS处理upcall流程分析
    mybatis在CRUD
    leetcode先刷_Valid Sudoku
    [TS] Implement a doubly linked list in TypeScript
    [TS] Implement a singly linked list in TypeScript
    [Python] The get() method on Python dicts and its "default" arg
    [Javascript AST] 4. Continue: Report ESLint error
    [RxJS] Learn How To Use RxJS 5.5 Beta 2
    [NPM] Update published npm packages using np
  • 原文地址:https://www.cnblogs.com/gojava/p/3528497.html
Copyright © 2011-2022 走看看