zoukankan      html  css  js  c++  java
  • Combine small files to Sequence file

    Combine small files to sequence file or avro files are a good method to feed hadoop.

    Small files in hadoop will take more namenode memory resource.

    SequenceFileInputFormat 是一种Key value 格式的文件格式。

    Key和Value的类型可以自己实现其序列化和反序列化内容。

    SequenceFile示例内容:

    image

    其默认的key,value之间的分隔符 是01,这个与hive文件的存储格式是匹配的,这样也方便直接把这种文件加载到hive里面。

    以下的代码仅供参考作用,真实的项目中使用的时候,可以做适当的调整,以更高地节约资源和满足项目的需要。

    示例代码如下:

    package myexamples;
    
    import java.io.File;
    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.apache.commons.io.FileUtils;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;
    
    public class localf2seqfile {
    /*
     * Local folder has a lot of txt file
     * we need handle it in map reduce
     * so we want to load these files to one sequence file
     * key: source file name
     * value: file content
     * */
    	static void write2Seqfile(FileSystem fs,Path hdfspath,HashMap<Text,Text> hm)  
    	{
    	    SequenceFile.Writer writer=null;  
    	      
    	    try {  
    	      writer=SequenceFile.createWriter(fs, fs.getConf(), hdfspath, Text.class, Text.class);  
    	      
    	      for(Map.Entry<Text,Text> entry:hm.entrySet())    
    	        writer.append(entry.getKey(),entry.getValue());  
    	      
    	    } catch (IOException e) {  
    	    	e.printStackTrace();  
    	    }finally{  
    	    	try{writer.close();}catch(IOException ioe){}
    	    }  
    	}
    	static HashMap<Text,Text> collectFiles(String localpath) throws IOException
    	{
    		HashMap<Text,Text> hm = new HashMap<Text,Text>();
    		File f = new File(localpath);
    		if(!f.isDirectory()) return hm;
    		for(File file:f.listFiles())
    			hm.put(new Text(file.getName()), new Text(FileUtils.readFileToString(file)));
    		
    		return hm;
    	}
    	static void readSeqFile(FileSystem fs, Path hdfspath) throws IOException
    	{
    		 SequenceFile.Reader reader = new SequenceFile.Reader(fs,hdfspath,fs.getConf());  
    		         Text key = new Text();  
    		         Text value = new Text();  
    		         while (reader.next(key, value)) {  
    		        	 System.out.print(key +" ");  
    		        	 System.out.println(value);  
    		         }  
    		         reader.close();
    		
    	}
    	public static void main(String[] args) throws IOException {
    		args = "/home/hadoop/test/sub".split(" ");
    		
    		Configuration conf = new Configuration();
    		conf.set("fs.default.name", "hdfs://namenode:9000");
    		
    		FileSystem fs = FileSystem.get(conf);
    		System.out.println(fs.getUri()); 
    		Path file = new Path("/user/hadoop/seqfiles/seqdemo.seq");
    		if (fs.exists(file)) fs.delete(file,false);
    		HashMap<Text,Text> hm = collectFiles(args[0]);
    		write2Seqfile(fs,file,hm);
    		readSeqFile(fs,file);
    		
    		fs.close();
    	}
    }
    Looking for a job working at Home about MSBI
  • 相关阅读:
    硬盘坏道及后续的拯救工作
    Extension GL_VERSION_1_2 could not be loaded.
    js显示当前的年月日时分秒
    如何删除桌面上的回收站?
    Fedora 15 U盘 安装心得
    庄子·内篇·逍遥游
    Additional Oracle Performance Extensions
    Features Specific to JDBC OCI Driver
    OracleDatabase 配置
    vim字符编码设置
  • 原文地址:https://www.cnblogs.com/huaxiaoyao/p/4297353.html
Copyright © 2011-2022 走看看