package readImgUrl; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; public class ClassifyUrl { private static int HASHLEN = 100; private static String file_dir = "D:\学习\实验室项目\ImageNet图片爬取\classify_url\"; private static String src_file = "D:\学习\实验室项目\ImageNet图片爬取\fall11_urls.txt"; public static void main(String[] args) throws Exception { // TODO Auto-generated method stub classify_url("D:\学习\实验室项目\ImageNet图片爬取\fall11_urls.txt"); // rank_filedata("2"); // String s = judgeFileCode(src_file); // String s = codeString(src_file); // System.out.println(s); } /** * 对一个文件进行排序 */ public static void rank_filedata(String filename){ String path1 = file_dir+filename+".txt"; String path2 = file_dir+filename+"_"+".txt"; List<String> list = reader_list(path1); System.out.println(list.size()); // 排序,通过泛型和匿名类来实现 Collections.sort(list, new Comparator<String>() { public int compare(String s1, String s2) { String h1 = s1.split(" ")[1]; String h2 = s2.split(" ")[1]; return h1.compareTo(h2); } }); writer_list(list, path2); } /** * 读取文件,返回list * @param path * @return */ public static List reader_list(String path){ List<String> lineList = new ArrayList(); try { BufferedReader reader = new BufferedReader(new FileReader(path)); String line = reader.readLine(); while(null != line){ lineList.add(line); line = reader.readLine(); } reader.close(); return lineList; } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } return null; } /** * 将List写入文件 * @param line */ public static void writer_list(List list, String path){ try { BufferedWriter writer = new BufferedWriter(new FileWriter(path)); for(int i=0; i<list.size(); i++){ String line = (String)list.get(i); writer.write(line+" "); } writer.close(); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } /** * 从文件中逐行读取数据,分类写入0-99个文件 */ public static void classify_url(String path){ try { BufferedReader reader ; String filecode = judgeFileCode(path); reader = new BufferedReader(new InputStreamReader(new FileInputStream(path),filecode)); // BufferedReader reader = new BufferedReader(new FileReader(path)); String line = reader.readLine(); int line_num = 0; // while(line_num<4101000){ // reader.readLine(); // line_num++; // } while(null != line){ try { String host = new URL(line.split(" ")[1]).getHost(); int type = hash(host.toCharArray()); // writer(type+"", line); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } line = reader.readLine(); line_num++; if(line_num%100==0){ // System.out.println(line_num); char [] cc = line.toCharArray(); for(char c: cc){ if(isCnorEn(c)){ System.out.println(line); break; } } // break; } } reader.close(); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } /** * 判断是中文还是英文字符 */ static boolean isCnorEn(char c) { if ((c >= 0x0391 && c <= 0xFFE5) // 中文字符 || (c >= 0x0000 && c <= 0x00FF)) // 英文字符 return true; return false; // if ((c >= 0x0391 && c <= 0xFFE5) // 英文字符 // ) // // return true; // return false; } /** * 给定一个字符串,返回hash后的int值 * @param word * @return */ public static int hash(char[] word) { int index = 0; int i=0; while(i<word.length) { index += index * 31 + word[i]; i++; } return Math.abs(index % HASHLEN); } /** * 将line写入filename中(文件不存在则先建立) * @param filename * @param line */ public static void writer(String filename, String line){ String path = file_dir+filename+".txt"; try { File file = new File(path); if(!file.isFile()){ file.createNewFile(); } String filecode = judgeFileCode(src_file); OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(path, true), "GBK"); // BufferedWriter writer = new BufferedWriter(new FileWriter(path, true)); if(null != line){ writer.write(line+" "); } writer.close(); } catch (Exception e) { // TODO: handle exception e.printStackTrace(); } } public static String judgeFileCode(String path){ try { File file = new File(path); InputStream in= new java.io.FileInputStream(file); byte[] b = new byte[3]; in.read(b); in.close(); if (b[0] == -17 && b[1] == -69 && b[2] == -65) { // System.out.println(file.getName() + ":编码为UTF-8"); return "UTF-8"; } else{ // System.out.println(file.getName() + ":可能是GBK,也可能是其他编码"); return "GBK"; } } catch (Exception e) { // TODO: handle exception } return null; } /** * 判断文件的编码格式 * @param fileName :file * @return 文件编码格式 * @throws Exception */ public static String codeString(String fileName) throws Exception{ BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName)); int p = (bin.read() << 8) + bin.read(); String code = null; //其中的 0xefbb、0xfffe、0xfeff、0x5c75这些都是这个文件的前面两个字节的16进制数 switch (p) { case 0xefbb: code = "UTF-8"; break; case 0xfffe: code = "Unicode"; break; case 0xfeff: code = "UTF-16BE"; break; case 0x5c75: code = "ANSI|ASCII" ; break ; default: code = "GBK"; } return code; } }