zoukankan      html  css  js  c++  java
  • storm实战之WordCount


    一,环境搭建

      Eclipse项目的创建和jar包的导入。

    二,代码编写

      1,组件spout的代码编写,用来发射数据源。

    package com;
    
    import java.util.Map;
    import java.util.Random;
    import org.apache.storm.spout.SpoutOutputCollector;
    import org.apache.storm.task.TopologyContext;
    import org.apache.storm.topology.OutputFieldsDeclarer;
    import org.apache.storm.topology.base.BaseRichSpout;
    import org.apache.storm.tuple.Fields;
    import org.apache.storm.tuple.Values;
    /**
     * Spout that emits one fixed sentence, then pauses for about a second.
     *
     * <p>Despite the class name, the sentence is not random: every call to
     * {@link #nextTuple()} emits the same string. Each tuple carries a single
     * field named {@code "spout"}.
     */
    public class RandomSentenceSpout extends BaseRichSpout {
        // Spelled exactly "serialVersionUID" so Java serialization recognises it.
        private static final long serialVersionUID = 1L;

        /** The sentence emitted on every call to {@link #nextTuple()}. */
        private static final String SENTENCE =
                "hello zhangsan nice to meet you zhangsan hello lisi welcome to bj";

        /** Pause after each emit, in milliseconds. */
        private static final long EMIT_INTERVAL_MS = 1000L;

        /** Collector received in {@link #open}; used to emit this spout's tuples. */
        private SpoutOutputCollector collector;

        /**
         * Emits the fixed sentence as a one-field tuple and then sleeps.
         *
         * <p>If the sleep is interrupted, the thread's interrupt flag is restored
         * so that the caller can observe the interruption.
         */
        @Override
        public void nextTuple() {
            collector.emit(new Values(SENTENCE));
            try {
                Thread.sleep(EMIT_INTERVAL_MS);
            } catch (InterruptedException e) {
                // Do not swallow the interruption; hand it back to the calling thread.
                Thread.currentThread().interrupt();
            }
        }

        /**
         * Stores the collector for later use by {@link #nextTuple()}.
         *
         * @param conf      topology configuration (unused here)
         * @param context   topology context (unused here)
         * @param collector collector used to emit tuples from this spout
         */
        @Override
        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            this.collector = collector;
        }

        /**
         * Declares the single output field, named {@code "spout"}.
         *
         * @param declarer declarer on which the output schema is registered
         */
        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("spout"));
        }
    }
    

      2,bolt组件的代码编写,用来切割字段。(注意:下面贴出的代码与第1步的spout完全相同,并不是用来切割的bolt;提交类中引用的WordBolt应当把句子切分成单词,并以字段名"word"逐个发射,供后面的WordCount读取。)

    package com;
    
    import java.util.Map;
    import java.util.Random;
    import org.apache.storm.spout.SpoutOutputCollector;
    import org.apache.storm.task.TopologyContext;
    import org.apache.storm.topology.OutputFieldsDeclarer;
    import org.apache.storm.topology.base.BaseRichSpout;
    import org.apache.storm.tuple.Fields;
    import org.apache.storm.tuple.Values;
    /**
     * Spout that repeatedly emits one hard-coded sentence, about once per second.
     *
     * <p>The sentence is fixed rather than random. Every emitted tuple has exactly
     * one field, declared under the name {@code "spout"}.
     */
    public class RandomSentenceSpout extends BaseRichSpout {
        // Must be named exactly "serialVersionUID" to act as the serialization version id.
        private static final long serialVersionUID = 1L;

        /** Text sent out by each {@link #nextTuple()} invocation. */
        private static final String SENTENCE =
                "hello zhangsan nice to meet you zhangsan hello lisi welcome to bj";

        /** How long to sleep after emitting, in milliseconds. */
        private static final long EMIT_INTERVAL_MS = 1000L;

        /** Output collector supplied through {@link #open}. */
        private SpoutOutputCollector collector;

        /**
         * Emits the sentence as a single-value tuple, then sleeps for the emit interval.
         *
         * <p>An interruption during the sleep is not swallowed: the interrupt flag
         * is set again before returning.
         */
        @Override
        public void nextTuple() {
            Values sentenceTuple = new Values(SENTENCE);
            collector.emit(sentenceTuple);
            try {
                Thread.sleep(EMIT_INTERVAL_MS);
            } catch (InterruptedException e) {
                // Re-assert the interrupt so the owning thread can react to it.
                Thread.currentThread().interrupt();
            }
        }

        /**
         * Remembers the collector so that {@link #nextTuple()} can emit through it.
         *
         * @param conf      topology configuration (not read)
         * @param context   topology context (not read)
         * @param collector collector used for all emits of this spout
         */
        @Override
        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            this.collector = collector;
        }

        /**
         * Registers the output schema: one field called {@code "spout"}.
         *
         * @param declarer receives the field declaration
         */
        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("spout"));
        }
    }
    

      3,bolt组件的代码编写,用来统计字段的数量。

    package com;
    
    import java.util.HashMap;
    import java.util.Map;
    
    import org.apache.storm.task.OutputCollector;
    import org.apache.storm.task.TopologyContext;
    import org.apache.storm.topology.OutputFieldsDeclarer;
    import org.apache.storm.topology.base.BaseRichBolt;
    import org.apache.storm.tuple.Fields;
    import org.apache.storm.tuple.Tuple;
    import org.apache.storm.tuple.Values;
    
    /**
     * Bolt that keeps a running count of every word it receives.
     *
     * <p>For each incoming tuple it reads the {@code "word"} field, increments that
     * word's counter, prints the whole count table to standard output and acks the
     * tuple. It declares no output fields and emits nothing.
     */
    public class WordCount extends BaseRichBolt {
        // Primitive long, spelled exactly "serialVersionUID", so Java serialization honours it.
        private static final long serialVersionUID = 1L;

        /** Collector received in {@link #prepare}; used to ack processed tuples. */
        private OutputCollector collector;

        /** Word -> number of times this bolt instance has seen it. */
        Map<String, Integer> map = new HashMap<String, Integer>();

        /**
         * Counts the word carried by the tuple and prints the updated table.
         *
         * @param value input tuple; must contain a string field named {@code "word"}
         */
        @Override
        public void execute(Tuple value) {
            String data = value.getStringByField("word");
            // Single lookup: a missing entry means this is the first occurrence.
            Integer seen = map.get(data);
            map.put(data, seen == null ? 1 : seen + 1);
            System.out.println(map);
            // BaseRichBolt does not ack automatically; mark the tuple as processed.
            collector.ack(value);
        }

        /**
         * Stores the collector for use in {@link #execute}.
         *
         * @param arg0      topology configuration (unused here)
         * @param arg1      topology context (unused here)
         * @param collector collector used to ack tuples
         */
        @Override
        public void prepare(Map arg0, TopologyContext arg1, OutputCollector collector) {
            this.collector = collector;
        }

        /**
         * Declares no output fields, because this bolt does not emit tuples.
         *
         * @param d declarer for the output schema (left untouched)
         */
        @Override
        public void declareOutputFields(OutputFieldsDeclarer d) {
            // Intentionally empty: nothing is emitted downstream.
        }
    }
    

      4,编写提交类

    package com;
    
    import org.apache.storm.Config;
    import org.apache.storm.LocalCluster;
    import org.apache.storm.StormSubmitter;
    import org.apache.storm.generated.AlreadyAliveException;
    import org.apache.storm.generated.AuthorizationException;
    import org.apache.storm.generated.InvalidTopologyException;
    import org.apache.storm.topology.TopologyBuilder;
    import org.apache.storm.tuple.Fields;
    
    public class mian {
    
    	public static void main(String[] args) {
    		TopologyBuilder topologyBuilder = new TopologyBuilder();
    		topologyBuilder.setSpout("spout", new RandomSentenceSpout());
    		topologyBuilder.setBolt("wordBolt", new WordBolt()).shuffleGrouping("spout");
    		topologyBuilder.setBolt("wordint", new WordCount()).fieldsGrouping("wordBolt", new Fields("word"));
    		Config config = new Config();
    		if(args==null||args.length==0){
                  //集群模式 LocalCluster localCluster = new LocalCluster(); localCluster.submitTopology("wordCount",config ,topologyBuilder.createTopology()); }else{
                  //单机模式 config.setNumWorkers(1); try { StormSubmitter.submitTopology(args[0],config,topologyBuilder.createTopology()); } catch (AlreadyAliveException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InvalidTopologyException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (AuthorizationException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } }

      5,打成jar包,上传到服务器运行。注意只打主类的class,不要连带项目中的jar一起打入。否则在集群上面会报错。

      6,Stream Grouping详解 

      Shuffle Grouping: 随机分组, 随机派发stream里面的tuple,保证每个bolt接收到的tuple数目大致相同。

      Fields Grouping:按字段分组,比如按userid来分组,具有同样userid的tuple会被分到相同的Bolts里的一个task,而不同的userid则会被分配到不同的bolts里的task。word 统计用的是Fields Grouping,mapreduce key相同自带就分组

      All Grouping:广播发送,对于每一个tuple,所有的bolts都会收到。

      Global Grouping:全局分组, 这个tuple被分配到storm中的一个bolt的其中一个task。再具体一点就是分配给id值最低的那个task。

      None Grouping:不分组,这个分组的意思是说stream不关心到底谁会收到它的tuple。目前这种分组和Shuffle Grouping是一样的效果,有一点不同的是storm会把这个bolt放到这个bolt的订阅者同一个线程里面去执行。

      Direct Grouping: 直接分组, 这是一种比较特别的分组方法,用这种分组意味着消息的发送者指定由消息接收者的哪个task处理这个消息。只有被声明为Direct Stream的消息流可以声明这种分组方法。而且这种消息tuple必须使用emitDirect方法来发射。消息处理者可以通过TopologyContext来获取处理它的消息的task的id (OutputCollector.emit方法也会返回task的id)。

      Local or shuffle grouping:如果目标bolt有一个或者多个task在同一个工作进程中,tuple将会被随机发送给这些tasks。否则,和普通的Shuffle Grouping行为一致。


  • 相关阅读:
    算法初步——哈希表B.1038统计同成绩学生
    算法初步——哈希表B10133.旧键盘打字 (注意bool型数组的赋值为true的方法)
    算法初步——哈希表B1029/A1084. 旧键盘
    算法初步——排序 A1012.The Best Rank(25)
    《思维导图》——东尼博赞
    算法初步——排序B1015/A1062.德才论
    入门模拟——(字符串处理)A1001. A+B Format(20)
    RMQ问题(线段树+ST算法)
    PKU 2406 Power Strings(KMP最长循环不重叠字串)
    KMP算法 kuangbin
  • 原文地址:https://www.cnblogs.com/songweideboke/p/9901083.html
Copyright © 2011-2022 走看看