zoukankan      html  css  js  c++  java
  • 使用hash拆分文件

    package readImgUrl;
    
    import java.io.BufferedInputStream;
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.Closeable;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.net.URL;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.List;
    
    /**
     * Utilities for splitting a large tab-separated URL list into hash buckets
     * keyed by the host name of each URL, plus helpers for sorting a bucket file
     * and for guessing a file's text encoding from its leading bytes.
     *
     * <p>Each input line is expected to hold at least two tab-separated fields,
     * the second of which is a URL.
     */
    public class ClassifyUrl {
    	
    	/** Number of hash buckets; {@link #hash(char[])} returns values in {@code [0, HASHLEN)}. */
    	private static final int HASHLEN = 100;
    	
    	/**
    	 * Project data directory. The non-ASCII path components are written as
    	 * Unicode escapes so the value does not depend on the encoding the source
    	 * file is compiled with.
    	 */
    	private static final String BASE_DIR =
    			"D:\\\u5b66\u4e60\\\u5b9e\u9a8c\u5ba4\u9879\u76ee\\ImageNet\u56fe\u7247\u722c\u53d6\\";
    	
    	/** Directory that holds the per-bucket files ({@code <bucket>.txt}). */
    	private static final String file_dir = BASE_DIR + "classify_url\\";
    	
    	/** The URL list that is split into buckets. */
    	private static final String src_file = BASE_DIR + "fall11_urls.txt";
    	
    	/**
    	 * Entry point: scans {@link #src_file} with {@link #classify_url(String)}.
    	 *
    	 * @param args unused
    	 * @throws Exception declared for compatibility; the called helpers report
    	 *         their own errors
    	 */
    	public static void main(String[] args) throws Exception {
    		classify_url(src_file);
    	}
    
    	/**
    	 * Sorts one bucket file by the second tab-separated field of each line.
    	 * Reads {@code file_dir + filename + ".txt"}, prints the number of lines
    	 * read, and writes the sorted lines to {@code file_dir + filename + "_.txt"}.
    	 * Does nothing if the input file cannot be read.
    	 *
    	 * @param filename bucket file name without directory or extension
    	 */
    	public static void rank_filedata(String filename){
    		String path1 = file_dir+filename+".txt";
    		String path2 = file_dir+filename+"_"+".txt";
    		List<String> list = reader_list(path1);
    		if(null == list){
    			// reader_list has already reported the failure; there is nothing to sort.
    			return;
    		}
    		System.out.println(list.size());
    		Collections.sort(list, new Comparator<String>() {
    			public int compare(String s1, String s2) {
    				return secondField(s1).compareTo(secondField(s2));
    			}
    		});
    		writer_list(list, path2);
    	}
    
    	/**
    	 * Returns the second tab-separated field of a line, or an empty string
    	 * when the line has no second field, so that a malformed line cannot make
    	 * the sort throw.
    	 */
    	private static String secondField(String line){
    		String[] fields = line.split("\t");
    		return fields.length > 1 ? fields[1] : "";
    	}
    
    	/**
    	 * Reads a text file into a list, one element per line, using the platform
    	 * default charset.
    	 *
    	 * @param path file to read
    	 * @return the lines in file order, or {@code null} if the file could not
    	 *         be read (the error is printed)
    	 */
    	public static List<String> reader_list(String path){
    		List<String> lineList = new ArrayList<String>();
    		BufferedReader reader = null;
    		try {
    			reader = new BufferedReader(new FileReader(path));
    			String line;
    			while(null != (line = reader.readLine())){
    				lineList.add(line);
    			}
    			return lineList;
    		} catch (Exception e) {
    			e.printStackTrace();
    			return null;
    		} finally {
    			closeQuietly(reader);
    		}
    	}
    
    	/**
    	 * Writes every element of a list to a file, one per line terminated by a
    	 * line feed, replacing any existing content and using the platform default
    	 * charset. Errors are printed, not thrown.
    	 *
    	 * @param list lines to write; every element must be a {@code String}
    	 * @param path destination file
    	 */
    	public static void writer_list(List<?> list, String path){
    		BufferedWriter writer = null;
    		try {
    			writer = new BufferedWriter(new FileWriter(path));
    			for(Object item : list){
    				writer.write((String)item+"\n");
    			}
    			// Close on the normal path so that a failing flush is reported below.
    			writer.close();
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			closeQuietly(writer);
    		}
    	}
    
    	/**
    	 * Scans a tab-separated URL list line by line. For every line the host of
    	 * the URL in the second field is extracted and hashed into a bucket index;
    	 * a line whose URL cannot be parsed is reported with a stack trace and
    	 * skipped. The step that would append the line to its bucket file,
    	 * {@code writer(type+"", line)}, is disabled in this version, so the method
    	 * currently only validates the input.
    	 *
    	 * <p>As a diagnostic, the lines at 0-based index 100, 200, ... are printed
    	 * when they contain at least one character accepted by
    	 * {@link #isCnorEn(char)}.
    	 *
    	 * @param path file to scan; its encoding is taken from
    	 *        {@link #judgeFileCode(String)}
    	 */
    	public static void classify_url(String path){
    		FileInputStream in = null;
    		BufferedReader reader = null;
    		try {
    			String filecode = judgeFileCode(path);
    			if(null == filecode){
    				// The file could not be opened; judgeFileCode has reported why.
    				return;
    			}
    			in = new FileInputStream(path);
    			reader = new BufferedReader(new InputStreamReader(in, filecode));
    			int line_num = 0;
    			String line;
    			while(null != (line = reader.readLine())){
    				if(line_num > 0 && line_num%100 == 0 && containsCnOrEn(line)){
    					System.out.println(line);
    				}
    				try {
    					String host = new URL(line.split("\t")[1]).getHost();
    					// Bucket index for this host; persisting the line is disabled here.
    					int type = hash(host.toCharArray());
    				} catch (Exception e) {
    					e.printStackTrace();
    				}
    				line_num++;
    			}
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			closeQuietly(reader);
    			closeQuietly(in);
    		}
    	}
    
    	/** Returns true if any character of the line is accepted by {@link #isCnorEn(char)}. */
    	private static boolean containsCnOrEn(String line){
    		for(char c : line.toCharArray()){
    			if(isCnorEn(c)){
    				return true;
    			}
    		}
    		return false;
    	}
    
    	/**
    	 * Tests whether a character falls in one of the two ranges this class
    	 * treats as text: U+0000 to U+00FF (labelled "English" by the original
    	 * author) or U+0391 to U+FFE5 (labelled "Chinese").
    	 *
    	 * @param c character to test
    	 * @return {@code true} if {@code c} lies in either range
    	 */
    	static boolean isCnorEn(char c) {
    		// A char is never negative, so the lower range needs only its upper bound.
    		return (c >= 0x0391 && c <= 0xFFE5) || c <= 0x00FF;
    	}
    
    	/**
    	 * Hashes a character sequence into a bucket index.
    	 *
    	 * <p>The accumulator is updated with {@code index += index * 31 + c}, which
    	 * is the same as {@code index = index * 32 + c} with int wrap-around. The
    	 * arithmetic is kept exactly as it was so that bucket assignments stay
    	 * stable.
    	 *
    	 * @param word characters to hash; must not be {@code null}
    	 * @return a value in {@code [0, HASHLEN)}; 0 for an empty array
    	 */
    	public static int hash(char[] word) {
    		int index = 0;
    		for(char c : word){
    			index += index * 31 + c;
    		}
    		// The remainder lies strictly between -HASHLEN and HASHLEN, so abs cannot overflow.
    		return Math.abs(index % HASHLEN);
    	}
    
    	/**
    	 * Appends one line, terminated by a line feed and encoded as GBK, to
    	 * {@code file_dir + filename + ".txt"}. The file is created if it does not
    	 * exist. A {@code null} line creates the file but writes nothing. Errors
    	 * are printed, not thrown.
    	 *
    	 * @param filename bucket file name without directory or extension
    	 * @param line text to append, may be {@code null}
    	 */
    	public static void writer(String filename, String line){
    		String path = file_dir+filename+".txt";
    		FileOutputStream out = null;
    		OutputStreamWriter writer = null;
    		try {
    			// Opening in append mode creates the file when it is missing.
    			out = new FileOutputStream(path, true);
    			writer = new OutputStreamWriter(out, "GBK");
    			if(null != line){
    				writer.write(line+"\n");
    			}
    			// Close on the normal path so that a failing flush is reported below.
    			writer.close();
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			closeQuietly(writer);
    			closeQuietly(out);
    		}
    	}
    
    	/**
    	 * Guesses a file's encoding from its first three bytes.
    	 *
    	 * @param path file to inspect
    	 * @return {@code "UTF-8"} if the file starts with the bytes EF BB BF,
    	 *         {@code "GBK"} otherwise (including files shorter than three
    	 *         bytes), or {@code null} if the file could not be read (the error
    	 *         is printed)
    	 */
    	public static String judgeFileCode(String path){
    		InputStream in = null;
    		try {
    			in = new FileInputStream(new File(path));
    			byte[] b = new byte[3];
    			int filled = 0;
    			// A single read may return fewer bytes than requested, so loop until full or EOF.
    			while(filled < b.length){
    				int n = in.read(b, filled, b.length - filled);
    				if(n < 0){
    					break;
    				}
    				filled += n;
    			}
    			boolean hasUtf8Bom = filled == b.length
    					&& b[0] == (byte)0xEF && b[1] == (byte)0xBB && b[2] == (byte)0xBF;
    			return hasUtf8Bom ? "UTF-8" : "GBK";
    		} catch (Exception e) {
    			e.printStackTrace();
    			return null;
    		} finally {
    			closeQuietly(in);
    		}
    	}
    
    	/**
    	 * Guesses a file's encoding from its first two bytes, read as one
    	 * big-endian 16-bit value.
    	 *
    	 * @param fileName file to inspect
    	 * @return {@code "UTF-8"} for 0xEFBB, {@code "Unicode"} for 0xFFFE,
    	 *         {@code "UTF-16BE"} for 0xFEFF, {@code "ANSI|ASCII"} for 0x5C75,
    	 *         and {@code "GBK"} for anything else, including files shorter
    	 *         than two bytes
    	 * @throws Exception if the file cannot be opened or read
    	 */
    	public static String codeString(String fileName) throws Exception{
    		BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName));
    		try {
    			int p = (bin.read() << 8) + bin.read();
    			switch (p) {
    				case 0xefbb:
    					return "UTF-8";
    				case 0xfffe:
    					return "Unicode";
    				case 0xfeff:
    					return "UTF-16BE";
    				case 0x5c75:
    					return "ANSI|ASCII";
    				default:
    					return "GBK";
    			}
    		} finally {
    			bin.close();
    		}
    	}
    
    	/**
    	 * Closes a resource if it is non-null, ignoring any failure. Used from
    	 * {@code finally} blocks, where the primary outcome has already been
    	 * decided and a close error must not replace it.
    	 */
    	private static void closeQuietly(Closeable resource){
    		if(null == resource){
    			return;
    		}
    		try {
    			resource.close();
    		} catch (IOException ignored) {
    			// Best-effort cleanup only.
    		}
    	}
    
    }
    

  • 相关阅读:
    MySQL动态添删改列字段
    关于javascript在子页面中函数无法调试问题的解决
    <T> T[] toArray(T[] a);
    MERGE INTO
    eclipse不能新建server
    关于tomcat7下websocket不能使用
    myeclipse启动tomcat报错cannot find a free socket for debugger
    checkbox提交多组数据到action
    Struts2 Action中的方法命名不要以get开头
    浅谈C#中的接口和抽象类
  • 原文地址:https://www.cnblogs.com/yan456jie/p/5369542.html
Copyright © 2011-2022 走看看