package download;

import org.json.JSONArray;
import org.json.JSONObject;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

/**
 * Pulls today's honeypot attack records from an Elasticsearch instance
 * (daily {@code logstash-yyyy.MM.dd} indices), filters out whitelisted
 * source IPs via a regexp {@code must_not} clause, pages through the
 * results with the ES scroll API, and returns one
 * {@link NearRealtimeIntelligence} entry per hit that carries a
 * {@code src_ip} field.
 *
 * <p>Configuration is read from {@code downloader.properties} on the
 * classpath (keys used here: {@code els.host}, {@code els.batch_size},
 * {@code scroll_timegap}, {@code white_list_file}).
 */
public class Downloader {

    /** Name of today's index, e.g. {@code logstash-2018.07.30}; set by {@link #getIndexStr()}. */
    public static String indexstr = "";

    /** Loaded {@code downloader.properties}; remains {@code null} if loading failed. */
    public static Properties p;

    static {
        try {
            p = loadPropertiesFromFile("downloader.properties");
        } catch (IOException e) {
            System.out.println("downloader.properties读取失败");
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws IOException {
        // Configer.configProxy();
        System.out.println("爬取完成,条数:" + getresult().size());
    }

    /**
     * Loads a properties file from the classpath.
     *
     * @param filename classpath resource name
     * @return the loaded properties
     * @throws IOException if the resource is missing or unreadable
     */
    public static Properties loadPropertiesFromFile(String filename) throws IOException {
        Properties props = new Properties();
        // Null-guard: getResourceAsStream returns null for a missing resource;
        // without this check Properties.load would throw an uncaught NPE
        // instead of the IOException the static initializer handles.
        try (InputStream input = Downloader.class.getClassLoader().getResourceAsStream(filename)) {
            if (input == null) {
                throw new FileNotFoundException("classpath resource not found: " + filename);
            }
            props.load(input);
        }
        return props;
    }

    /**
     * Opens the whitelist file (one regex per line) as a classpath resource.
     *
     * @return the resource stream, or {@code null} if the resource is missing
     */
    public static InputStream get_whitelist_inputstream() {
        ClassLoader classLoader = Downloader.class.getClassLoader();
        return classLoader.getResourceAsStream(p.getProperty("white_list_file"));
    }

    /**
     * Builds a single alternation regex from the whitelist file: each line
     * {@code x} becomes {@code (x)}, joined with {@code |}.
     *
     * @return e.g. {@code (1\.2\.3\.4)|(10\..*)}, or "" for an empty file
     * @throws IOException if the whitelist cannot be read
     */
    public static String get_whitelist_regex() throws IOException {
        StringBuilder regex = new StringBuilder();
        // NOTE(review): reads the file as UTF-8; the original used the platform
        // default charset — whitelist entries are expected to be ASCII regexes.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(get_whitelist_inputstream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                regex.append('(').append(line).append(")|");
            }
        }
        if (regex.length() != 0) {
            regex.setLength(regex.length() - 1); // drop trailing '|'
        }
        return regex.toString();
    }

    /**
     * Sends an HTTP POST with the given body and returns the response text.
     *
     * @param url    target URL
     * @param param  request body (sent as UTF-8)
     * @param header extra request headers, may be {@code null}
     * @return response body with line breaks removed
     * @throws IOException on connection or read failure
     */
    public static String post(String url, String param, Map<String, String> header) throws IOException {
        URL realUrl = new URL(url);
        URLConnection conn = realUrl.openConnection();
        // Timeouts so a dead ES node cannot hang the crawl indefinitely.
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(15000);
        if (header != null) {
            for (Map.Entry<String, String> entry : header.entrySet()) {
                conn.setRequestProperty(entry.getKey(), entry.getValue());
            }
        }
        conn.setRequestProperty("accept", "*/*");
        conn.setRequestProperty("connection", "Keep-Alive");
        conn.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        // Both flags are required for a POST over URLConnection.
        conn.setDoOutput(true);
        conn.setDoInput(true);

        StringBuilder result = new StringBuilder();
        // try-with-resources: the original leaked both streams when an
        // exception occurred before the manual close() calls.
        try (PrintWriter out = new PrintWriter(
                new OutputStreamWriter(conn.getOutputStream(), StandardCharsets.UTF_8))) {
            out.print(param);
            out.flush();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        }
        return result.toString();
    }

    /**
     * Sends an HTTP GET and returns the response text.
     *
     * @param url target URL
     * @return response body with line breaks removed
     * @throws IOException on connection or read failure
     */
    public static String get(String url) throws IOException {
        URL realUrl = new URL(url);
        URLConnection connection = realUrl.openConnection();
        connection.setRequestProperty("accept", "*/*");
        connection.setRequestProperty("connection", "Keep-Alive");
        connection.setRequestProperty("user-agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
        connection.setConnectTimeout(5000);
        connection.setReadTimeout(5000);
        connection.connect();

        StringBuilder sb = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }
        }
        return sb.toString();
    }

    /** Sets {@link #indexstr} to today's daily index name, e.g. {@code logstash-2018.07.30}. */
    public static void getIndexStr() {
        indexstr = "logstash-" + new SimpleDateFormat("yyyy.MM.dd").format(new Date());
        // indexstr = "tpot_test"; // for test
    }

    /**
     * Fetches the mapping of today's index and returns its type names
     * (one per honeypot sensor).
     *
     * @return set of mapping type names
     * @throws IOException on request failure
     */
    public static Set<String> getAttackTypeSet() throws IOException {
        getIndexStr();
        String attacktypeurl = p.getProperty("els.host") + "/" + indexstr + "/" + "_mapping?pretty=true";
        System.out.println("【getting all types today】>>" + attacktypeurl);
        String attacktyperesult = get(attacktypeurl);
        // The mapping response is keyed by index name, then "mappings",
        // whose keys are the type names.
        JSONObject jobj1 = new JSONObject(attacktyperesult);
        JSONObject jobj2 = jobj1.getJSONObject(indexstr);
        JSONObject jobj3 = jobj2.getJSONObject("mappings");
        return jobj3.keySet();
    }

    /**
     * Downloads all non-whitelisted hits for every attack type of today's
     * index, paging with the scroll API.
     *
     * @return one {@link NearRealtimeIntelligence} per hit with a {@code src_ip}
     * @throws IOException on request failure
     */
    public static LinkedList<NearRealtimeIntelligence> getresult() throws IOException {
        LinkedList<NearRealtimeIntelligence> result = new LinkedList<NearRealtimeIntelligence>();
        Set<String> attacktypeset = getAttackTypeSet();
        // Query body: exclude whitelisted src_ip values, page size from config.
        String param = "{ "
                + " \"query\": { "
                + " \"bool\": { "
                + " \"must_not\": [ "
                + " { "
                + " \"regexp\":{ "
                + " \"src_ip\":\"" + get_whitelist_regex() + "\" "
                + " } "
                + " } "
                + " ] "
                + " } "
                + " },\"size\":" + p.getProperty("els.batch_size") + " "
                + "}";

        for (String attacktype : attacktypeset) {
            // Skip the default mapping and syslog noise.
            if (attacktype.equals("_default_") || attacktype.equals("Syslog")) {
                continue;
            }
            System.out.println("【getting " + attacktype + " data】");
            String req = p.getProperty("els.host") + "/" + indexstr + "/" + attacktype
                    + "/_search?scroll=" + p.getProperty("scroll_timegap");
            System.out.println("posting url>>" + req);
            String res = post(req, param, null);

            JSONObject res_json = new JSONObject(res);
            JSONObject all_hits = res_json.getJSONObject("hits");
            JSONArray docu_array = all_hits.getJSONArray("hits");
            int total = all_hits.getInt("total");
            int pages = (int) Math.ceil(total / Double.parseDouble(p.getProperty("els.batch_size")));
            System.out.println("数据条数:" + total + " 页数:" + pages);
            String scroll_id = res_json.getString("_scroll_id");

            // First batch: print each src_ip (subsequent batches are silent,
            // matching the original behavior).
            collectSrcIps(docu_array, attacktype, result, true);

            // Remaining pages via the scroll endpoint, reusing scroll_id.
            for (int i = 1; i < pages; i++) {
                req = p.getProperty("els.host") + "/_search/scroll";
                String param_scroll = "{ "
                        + " \"scroll\":\"" + p.getProperty("scroll_timegap") + "\", "
                        + " \"scroll_id\":\"" + scroll_id + "\" "
                        + "}";
                res = post(req, param_scroll, null);
                res_json = new JSONObject(res);
                all_hits = res_json.getJSONObject("hits");
                docu_array = all_hits.getJSONArray("hits");
                collectSrcIps(docu_array, attacktype, result, false);
            }
        }
        return result;
    }

    /**
     * Converts every hit that carries a {@code src_ip} field into a
     * {@link NearRealtimeIntelligence} and appends it to {@code result}.
     *
     * @param hits       the "hits" array of an ES search/scroll response
     * @param attacktype source name to record on each entry
     * @param result     destination list (mutated)
     * @param verbose    whether to print each src_ip
     */
    private static void collectSrcIps(JSONArray hits, String attacktype,
                                      LinkedList<NearRealtimeIntelligence> result, boolean verbose) {
        for (int j = 0; j < hits.length(); j++) {
            JSONObject source = hits.getJSONObject(j).getJSONObject("_source");
            if (source.has("src_ip")) {
                String src_ip = source.getString("src_ip");
                if (verbose) {
                    System.out.println(src_ip);
                }
                NearRealtimeIntelligence adata = new NearRealtimeIntelligence();
                adata.setName(src_ip);
                adata.setSourceName(attacktype);
                result.add(adata);
            }
        }
    }
}
拉取过程中,注意:
1、请求参数中过滤掉白名单+设置大小分页读取
url: http://xxx.xxx.xxx.xxx:8000/logstash-2018.07.30/Honeytrap/_search?scroll=3m
String param = "{ " + " \"query\": { " + " \"bool\": { " + " \"must_not\": [ " + " { " + " \"regexp\":{ " + " \"src_ip\":\"" + get_whitelist_regex() + "\" " + " } " + " } " + " ] " + " } " + " },\"size\":" + p.getProperty("els.batch_size") + " " + "}";
2、读取文件
获得inputstream
ClassLoader classLoader=Downloader.class.getClassLoader(); InputStream whitelist_inputstream=classLoader.getResourceAsStream(p.getProperty("white_list_file"));
使用inputstream按行读
BufferedReader whitelist_reader=new BufferedReader(new InputStreamReader(whitelist_inputstream)); String line=null; while((line=whitelist_reader.readLine())!=null){ }
3、读取文件
Properties p = new Properties(); InputStream input = Downloader.class.getClassLoader().getResourceAsStream(filename); p.load(input);
4、解析json字符串
JSONObject res_json = new JSONObject(res); JSONObject all_hits = res_json.getJSONObject("hits"); JSONArray docu_array = all_hits.getJSONArray("hits");