本周学习了 Java 爬虫的相关知识。
代码量500 博客数1
一、Get请求
public class Web { static final Log logger = LogFactory.getLog(Web.class); public static void main(String[] args) throws Exception { //1、打开浏览器,创建httpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //创建URIBuilder URIBuilder uribuilder= new URIBuilder("https://www.qidian.com"); //设置参数:参数名+参数值,可设置多个 uribuilder.setParameter("key","xuanhuan").setParameter("", ""); //2、输入网址,发起请求,创建httpGet对象 HttpGet httpGet= new HttpGet(uribuilder.build()); System.out.println("发起请求的信息:"+httpGet); CloseableHttpResponse response=null; try { //3、按回车,发起请求,返回响应,使用httpClient对象发起请求 response = httpClient.execute(httpGet); //解析响应,获取数据 //判断状态码是否为两百 if(response.getStatusLine().getStatusCode()==200) { HttpEntity httpEntity = response.getEntity(); String content = EntityUtils.toString(httpEntity, "utf8"); System.out.println(content.length()); System.out.println(content); } }catch(Exception e) { e.printStackTrace(); }finally { try { //关闭response response.close(); //关闭httpClient httpClient.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } }
二、Post请求
public class Web { static final Log logger = LogFactory.getLog(Web.class); public static void main(String[] args) throws Exception { //1、打开浏览器,创建httpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //2、输入网址,发起请求,创建httpPost对象 HttpPost httpPost= new HttpPost("https://www.baidu.com/index.php"); System.out.println("发起请求的信息:"+httpPost); //Post使用,声明List集合,封装表单中的参数 List<NameValuePair> params= new ArrayList<NameValuePair>(); params.add(new BasicNameValuePair("","")); //创建表单的Entity对象,第一个参数是封装好的参数,第二个是编码 UrlEncodedFormEntity formEntity= new UrlEncodedFormEntity(params,"utf8"); //设置表单的Entity对象到Post请求中 httpPost.setEntity(formEntity); CloseableHttpResponse response=null; try { //3、按回车,发起请求,返回响应,使用httpClient对象发起请求 response = httpClient.execute(httpPost); //解析响应,获取数据 //判断状态码是否为两百 if(response.getStatusLine().getStatusCode()==200) { HttpEntity httpEntity = response.getEntity(); String content = EntityUtils.toString(httpEntity, "utf8"); System.out.println(content.length()); // System.out.println(content); }else { System.out.println("请求失败"+response); } }catch(Exception e) { e.printStackTrace(); }finally { try { //关闭response response.close(); //关闭httpClient httpClient.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } }
三、Jsoup解析HTML获取DOM
public class Jsouputil { public static void main(String[] args) throws Exception { testUrl(); testString(); } /** * 解析URL * @throws Exception */ public static void testUrl() throws Exception { //解析URL,第一个参数是URL,第二个是访问的超时时间 Document doc = Jsoup.parse(new URL("https://www.qidian.com"), 1000); //使用标签选择器,获取title标签里的内容 String title = doc.getElementsByTag("title").first().text(); System.out.println(title); } /** * 解析字符串 */ public static void testString() throws Exception { HttpClientPool httpClient =new HttpClientPool(); //创建连接池管理器 PoolingHttpClientConnectionManager cm =new PoolingHttpClientConnectionManager(); //获取网页HTML字符串 String content=httpClient.doGet(cm); //解析字符串 Document doc = Jsoup.parse(content); String title = doc.getElementsByTag("title").first().text(); System.out.println(title); } }
四、DOM的方式获取元素
public static void testDom()throws Exception{ //获取Document对象 HttpClientPool httpClient =new HttpClientPool(); //创建连接池管理器 PoolingHttpClientConnectionManager cm =new PoolingHttpClientConnectionManager(); //获取网页HTML字符串 String content=httpClient.doGet(cm); //解析字符串 Document doc = Jsoup.parse(content); // 1、根据id查询元素getElementById Element elementById = doc.getElementById("overseas_tit"); System.out.println(elementById.text()); // 2、根据标签获取元素getElementsByTag Elements elementsByTag = doc.getElementsByTag("span"); System.out.println(elementsByTag.text()); // 3、根据class获取元素getElementsByClass Elements elementsByClass = doc.getElementsByClass("chart_table_th"); System.out.println(elementsByClass.text()); // 4、根据属性获取元素getElementsByAttribute Elements elementsByAttribute = doc.getElementsByAttribute("src"); Elements elementsByAttributeValue = doc.getElementsByAttributeValue("class", "chart_table_name"); System.out.println(elementsByAttribute); System.out.println(elementsByAttributeValue.text()); }
五、获取元素中的数据
public static void testData()throws Exception{ //获取Document对象 HttpClientPool httpClient =new HttpClientPool(); //创建连接池管理器 PoolingHttpClientConnectionManager cm =new PoolingHttpClientConnectionManager(); //获取网页HTML字符串 String content=httpClient.doGet(cm); //解析字符串 Document doc = Jsoup.parse(content); Element element = doc.getElementById("overseas_tit"); System.out.println(element); String str=null; //获取元素中的内容 // //获取id // str=element.id(); // System.out.println("id:"+str); // //获取className // str=element.className(); // Set<String> classSet=element.classNames(); // for(String s:classSet) { // System.out.println(s); // } // System.out.println("className:"+str); // //获取属性的值attr // str=element.attr("class"); // System.out.println(str); //获取所有属性attributes Attributes attributes = element.attributes(); System.out.println(attributes.toString()); //获取文本内容 str=element.text(); System.out.println(str); }
六、Selector选择器获取元素
/** * 使用Selector选择器获取元素 */ public static void testSelector()throws Exception{ //获取Document对象 HttpClientPool httpClient =new HttpClientPool(); //创建连接池管理器 PoolingHttpClientConnectionManager cm =new PoolingHttpClientConnectionManager(); //获取网页HTML字符串 String content=httpClient.doGet(cm); //解析字符串 Document doc = Jsoup.parse(content); // //tagName,通过标签查找元素 // Elements elements = doc.select("span"); // for(Element element:elements) { // System.out.println(element.text()); // } // // //#id,通过id查找 // Element e = doc.select("#overseas_tit").first(); // System.out.println(e.text()); // // // //.class,通过class查找 // Element element = doc.select(".chart_table_name").first(); // System.out.println(element.text()); // // // //[attribute],利用属性获取 // Element element = doc.select("[class]").first(); // // System.out.println(element.text()); //[attr=value],利用属性获取 Elements element = doc.select("[class=chart_table_name]"); System.out.println(element.text()); //el#id:元素+id,h3#city_bj //el.class:元素+class //el[attr]:元素+属性名 //任意组合 //ancestor child:查找某个元素下子元素 //parent > child:查找某个父元素下的直接子元素 //parent > *:查找某个父元素下的所有子元素 }