今天对中图分类号网站(clcindex.com)进行了解析,按照层级对分类进行遍历。由于普通类无法使用 @Autowired 注入依赖,这里改用带 @Test 注解的测试方法来运行。
/**
 * Fetches one category page of clcindex.com and walks its rows.
 *
 * <p>For every element whose {@code name} attribute is {@code item-row}, the
 * text of the second {@code td} is stored as the entry name and the text of the
 * third {@code td} as the entry content. The link found in the third cell is
 * turned into an absolute URL and pushed onto a stack of child pages.
 *
 * <p>NOTE(review): the stack is only filled here and never popped, and the
 * populated {@code CLCNumber} objects are not stored anywhere in this method,
 * so as written it only handles a single level and prints what it finds.
 *
 * @throws Exception if the HTTP request or reading the response body fails
 */
@Test
public void testCode() throws Exception {
    String url = "https://www.clcindex.com/category/T/";
    Stack<String> stack = new Stack<>();

    // The client and the response both hold network resources; try-with-resources
    // guarantees they are released even when the request or the parsing throws.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        System.out.println(url);
        HttpGet httpGet = new HttpGet(url);
        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Anything other than 200 is silently ignored.
            if (response.getStatusLine().getStatusCode() != 200) {
                return;
            }
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            Document document = Jsoup.parse(content);

            // The parent code is the URL path after "/category/" without the trailing
            // slash. It depends only on the page URL, so compute it once.
            String parent = url.replace("https://www.clcindex.com/category/", "");
            if (parent.endsWith("/")) {
                parent = parent.substring(0, parent.length() - 1);
            }
            boolean isRootPage = url.equals("https://www.clcindex.com/category/");

            // jsoup returns an empty Elements (never null) when nothing matches.
            Elements rows = document.getElementsByAttributeValue("name", "item-row");
            for (Element row : rows) {
                Elements cells = row.select("td");
                CLCNumber clcNumber = new CLCNumber();
                for (int column = 0; column < cells.size(); column++) {
                    Element cell = cells.get(column);
                    if (column == 1) {
                        clcNumber.setName(cell.text());
                        System.out.println(cell.text());
                    } else if (column == 2) {
                        clcNumber.setContent(cell.text());
                        System.out.println("href:");
                        System.out.println(cell);
                        // Percent-encode brackets and braces in the link.
                        // NOTE(review): presumably because HttpGet rejects these
                        // characters in a URI - confirm.
                        String href = cell.select("a").attr("href")
                                .replace("[", "%5B")
                                .replace("]", "%5D")
                                .replace("{", "%7B")
                                .replace("}", "%7D");
                        System.out.println(href);
                        String childUrl = "https://www.clcindex.com" + href;
                        System.out.println(childUrl);
                        stack.push(childUrl);
                        System.out.println(cell.text());
                    }
                }
                // Only entries listed on the root category page get the label "1".
                if (isRootPage) {
                    clcNumber.setLabel("1");
                }
                clcNumber.setParent(parent);
            }
        }
    }
}