A Java web crawler demo
As a programmer, how can you hold your own in this industry without knowing a trick or two of web scraping? This is a small Java crawler demo I wrote during development, shared here for reference.
The crawler depends on the jsoup HTML parser (jsoup.jar).
Straight to the code:
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class BaikeCrawlerDemo {

    public static void main(String[] args) {
        try {
            // Baidu Baike page for 999感冒灵 (a cold remedy)
            String url = "http://baike.baidu.com/link?url=c95Y4QJym_d_wFKGmcibRTI_KIyj-X_tOjnlOGJS9qekgO1tmWaWnrn7QyAjqvZX8At7LbI1XIa69IBZWejiCXDVM0jkSBMnVZKKu4jeg-ef4TJkKCXEXWcGJ8DRGTuHxW4qWB3pNNU7Y0KdrbNvGK";
            // fetch the page HTML with a plain GET request
            String html = doGet(url);
            // let jsoup parse the HTML into a Document
            Document doc = Jsoup.parse(html);
            // select every <div class="para"> element
            Elements paragraphs = doc.select("div[class=para]");
            for (Element paragraph : paragraphs) {
                // print the text content of each paragraph
                System.out.println(paragraph.text());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static String doGet(String path) {
        InputStream in = null;
        ByteArrayOutputStream out = null;
        HttpURLConnection conn = null;
        try {
            // open a plain HttpURLConnection for the GET request
            // (a Referer header could be set here via conn.setRequestProperty
            //  to work around hotlink protection)
            URL url = new URL(path.trim());
            conn = (HttpURLConnection) url.openConnection();
            if (conn.getResponseCode() == 200) {
                in = conn.getInputStream();
                out = new ByteArrayOutputStream(4096);
                byte[] buff = new byte[4096];
                int count;
                while ((count = in.read(buff)) != -1) {
                    out.write(buff, 0, count);
                }
                return out.toString("UTF-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (out != null) {
                try { out.close(); } catch (IOException e) { e.printStackTrace(); }
            }
            if (in != null) {
                try { in.close(); } catch (IOException e) { e.printStackTrace(); }
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return null;
    }
}
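Since jsoup can also fetch pages over HTTP by itself, the hand-rolled doGet helper is not strictly required. Below is a minimal sketch of the same scraping step done entirely with Jsoup.connect; the class name, user-agent string, and timeout value are arbitrary choices for illustration, not part of the original demo.

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupConnectSketch {

    // fetch a page with jsoup itself and print every <div class="para">;
    // the user-agent and timeout here are example settings, not required values
    public static void printParagraphs(String url) throws IOException {
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(10000)
                .get();
        for (Element paragraph : doc.select("div[class=para]")) {
            System.out.println(paragraph.text());
        }
    }

    public static void main(String[] args) throws IOException {
        // pass the same Baidu Baike URL used in the demo above
        printParagraphs(args[0]);
    }
}

This keeps the fetching and parsing in one call, at the cost of less control over the raw HTTP connection than the HttpURLConnection version above.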
Please credit the source when reposting.