上次学习了 jsoup 之后,发现一些动态生成的网页内容无法直接抓取,于是又学习了 HtmlUnit。下面是抓取酷狗音乐与 QQ 音乐歌曲链接的两个例子:
酷狗音乐:
import java.io.BufferedInputStream; import java.io.FileOutputStream; import java.io.InputStream; import java.net.URL; import java.net.URLEncoder; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.nodes.Element; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.WebClient; public class worm7 { private static String name="离骚"; public static WebClient getWebClient(boolean flag){ WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45); webClient.getOptions().setUseInsecureSSL(true); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setAppletEnabled(false); webClient.getOptions().setJavaScriptEnabled(flag); webClient.getOptions().setTimeout(60000); webClient.getOptions().setPrintContentOnFailingStatusCode(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); return webClient; } public static String getMp3Url(WebClient webClient){ FileOutputStream outputStream = null; InputStream inputStream = null; BufferedInputStream bis = null; try { Page page=webClient.getPage("http://songsearch.kugou.com/song_search_v2?" 
+ "callback=jQuery112408395432201569397_1532930925600" + "&keyword="+URLEncoder.encode(name, "utf-8") + "&page=1" + "&pagesize=30" + "&userid=-1" + "&clientver=" + "&platform=WebFilter" + "&tag=em" + "&filter=2" + "&iscorrection=1" + "&privilege_filter=0" + "&_="+System.currentTimeMillis()); //System.out.println(page.getWebResponse().getContentAsString()); //System.out.println(zzee(page.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")); JSONObject job=JSONObject.parseObject("{"+zzee(page.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")+"}").getJSONObject("data"); System.out.println("job:"+job); JSONArray list=job.getJSONArray("lists"); System.out.println("list"+list); for(int i=0;i<list.size();i++){ String id1=list.getJSONObject(i).getString("FileHash"); String id2=list.getJSONObject(i).getString("AlbumID"); String detailUrl="http://www.kugou.com/yy/index.php?r=play/getdata" + "&hash="+id1 + "&album_id="+id2 + "&_="+System.currentTimeMillis(); Page page2=webClient.getPage(detailUrl); JSONObject job2=JSONObject.parseObject(page2.getWebResponse().getContentAsString()).getJSONObject("data"); System.out.println("标题:"+job2.getString("audio_name")); //System.out.println("歌词:"+job2.getString("lyrics")); System.out.println("mp3:"+job2.getString("play_url")); String outImage = job2.getString("audio_name")+ ".mp3"; URL imgUrl = new URL(job2.getString("play_url"));//获取输入流 inputStream = imgUrl.openConnection().getInputStream(); //将输入流信息放入缓冲流提升读写速度 bis = new BufferedInputStream(inputStream); //读取字节娄 byte[] buf = new byte[1024]; //生成文件 outputStream = new FileOutputStream("f://"+ outImage); int size = 0; //边读边写 while ((size = bis.read(buf)) != -1) { outputStream.write(buf, 0, size); } //刷新文件流 outputStream.flush(); } } catch (Exception e) { e.printStackTrace(); } return name; } private static String zzee(String str, String zz) { String list = null; Pattern p = Pattern.compile(zz); Matcher m = p.matcher(str); while (m.find()) { list = m.group(); } return 
list; } public static void main(String[] args) { WebClient webClient=getWebClient(false); getMp3Url(webClient); } }
运行结果:
QQ 音乐抓取示例:
import java.io.BufferedInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jsoup.nodes.Element; import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.WebClient; public class worm6 { private static String name="离骚"; static String id1=null; static String id2=null; static String id3=null; static String id4=null; static String name1=null; static String name2=null; static String url = null; static JSONObject job2=null; public static WebClient getWebClient(boolean flag){ WebClient webClient = new WebClient(BrowserVersion.FIREFOX_45); webClient.getOptions().setUseInsecureSSL(true); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setAppletEnabled(false); webClient.getOptions().setJavaScriptEnabled(flag); webClient.getOptions().setTimeout(60000); webClient.getOptions().setPrintContentOnFailingStatusCode(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); return webClient; } public static String getMp3Url(WebClient webClient){ try { Page page=webClient.getPage("https://c.y.qq.com/soso/fcgi-bin/client_search_cp?" 
+ "ct=24" + "&qqmusic_ver=1298" + "&new_json=1" + "&remoteplace=txt.yqq.center" + "&searchid=36047978388657978" + "&t=0" + "&aggr=1" + "&cr=1" + "&catZhida=1" + "&lossless=0" + "&p=1" + "&n=20" + "&w="+URLEncoder.encode(name, "utf-8") + "&g_tk=5381" + "&jsonpCallback=MusicJsonCallback6176591962889693" + "&loginUin=0" + "&hostUin=0" + "&format=jsonp" + "&inCharset=utf8" + "&outCharset=utf-8" + "¬ice=0" + "&platform=yqq" + "&needNewCode=0" ); //System.out.println("page:"+page); //System.out.println("------"+page.getWebResponse().getContentAsString()); //System.out.println("======"+zzee(page.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")); JSONObject job=JSONObject.parseObject("{"+zzee(page.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")+"}").getJSONObject("data"); //System.out.println("job:"+job); String job0=job.getString("song"); //System.out.println("job0"+job0); job=JSON.parseObject(job0); JSONArray list=job.getJSONArray("list"); //System.out.println("list:"+list); for(int i=0;i<list.size();i++){ id1=list.getJSONObject(i).getString("mid"); //System.out.println("id1"+id1); id2=list.getJSONObject(i).getString("file"); //System.out.println("id"+id2); id2="C400"+JSONObject.parseObject(id2).getString("media_mid")+".m4a"; //System.out.println("id"+id2); name1=list.getJSONObject(i).getString("title"); name2=list.getJSONObject(i).getString("singer"); //System.out.println(name2); JSONArray name=JSON.parseArray(name2); //System.out.println("job4:"+name); name2=name.getJSONObject(0).getString("name"); //System.out.println(name.getJSONObject(0).getString("name")); /*String detailUrl="https://c.y.qq.com/v8/fcg-bin/fcg_play_single_song.fcg?" 
+ "songmid="+id1 + "&tpl=yqq_song_detail&format=jsonp&callback=getOneSongInfoCallback&g_tk=5381&jsonpCallback=getOneSongInfoCallback&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq&needNewCode=0" ; Page page2=webClient.getPage(detailUrl); //System.out.println(page2); String b="{"+zzee(page2.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")+"}"; //System.out.println("b"+b); JSONObject job1=JSONObject.parseObject("{"+zzee(page2.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")+"}").getJSONObject("url"); System.out.println("job1:"+job1); String job2=job1.getString(id2); System.out.println("job2"+job2);*/ String url1="https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?g_tk=5381&jsonpCallback=MusicJsonCallback32651599216689386&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq&needNewCode=0&cid=205361747&callback=MusicJsonCallback32651599216689386&uin=0" +"&songmid="+id1 +"&filename="+id2 +"&guid=2241489759"; ; Page page2=webClient.getPage(url1); //System.out.println("page2"+page2); JSONObject job2=JSONObject.parseObject("{"+zzee(page2.getWebResponse().getContentAsString(),"(?<=\(\{).*?(?=\}\))")+"}").getJSONObject("data"); //System.out.println("标题:"+job2.getString("items")); String job3=job2.getString("items"); JSONArray job4=JSON.parseArray(job3); //System.out.println("job4:"+job4); //System.out.println(job4.getJSONObject(0).getString("vkey")); url ="http://dl.stream.qqmusic.qq.com/"+id2+"?vkey="+job4.getJSONObject(0).getString("vkey")+"&guid=2241489759&uin=0&fromtag=66"; System.out.println("name:"+name1+"--"+name2); System.out.println("url:"+url); download(); } } catch (Exception e) { e.printStackTrace(); } return name; } private static String zzee(String str, String zz) { String list = null; Pattern p = Pattern.compile(zz); Matcher m = p.matcher(str); while (m.find()) { list = m.group(); } return list; } private static void download() throws IOException{ 
FileOutputStream outputStream = null; InputStream inputStream = null; BufferedInputStream bis = null; String outImage = name1+"--"+name2+ ".mp3"; URL imgUrl = new URL(url);//获取输入流 inputStream = imgUrl.openConnection().getInputStream(); //将输入流信息放入缓冲流提升读写速度 bis = new BufferedInputStream(inputStream); //读取字节娄 byte[] buf = new byte[1024]; //生成文件 outputStream = new FileOutputStream("f://"+ outImage); int size = 0; //边读边写 while ((size = bis.read(buf)) != -1) { outputStream.write(buf, 0, size); } //刷新文件流 outputStream.flush(); } public static void main(String[] args) { WebClient webClient=getWebClient(false); getMp3Url(webClient); } }
运行结果:
相比之下,酷狗音乐相对好爬一些,QQ 音乐则要繁琐一些……