zoukankan      html  css  js  c++  java
  • Twenty Newsgroups Classification任务之二seq2sparse(3)

    接上篇,如果想对上篇的问题进行测试其实可以简单的编写下面的代码:

    package mahout.fansy.test.bayes.write;
    
    import java.io.IOException;
    import java.util.List;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    import org.apache.mahout.common.Pair;
    import com.google.common.collect.Lists;
    import com.google.common.io.Closeables;
    
    /**
     * Stand-alone driver that writes a small hand-made dictionary to a Hadoop
     * {@link SequenceFile}, so the writer's state can be inspected in a debugger.
     *
     * <p>Each output record is a {@code Text} term paired with an
     * {@code IntWritable} index assigned in insertion order, starting at 0.
     */
    public class TestCreateDictionaryChunks {
    	/**
    	 * Writes the sample dictionary to a fixed HDFS location.
    	 *
    	 * @param args unused
    	 * @throws IOException if the file system cannot be reached or the write fails
    	 */
    	public static void main(String[] args) throws IOException {
    		String output="hdfs://ubuntu:9000/user/test/test_dictionary";
    		writeToPath(output);
    	}

    	/**
    	 * Writes every key returned by {@link #getInputData()} to a SequenceFile at
    	 * {@code output}, using a running counter as the value. The second element
    	 * of each input pair is not written.
    	 *
    	 * @param output URI of the SequenceFile to create
    	 * @throws IOException if creating, appending to, or closing the file fails
    	 */
    	public static void writeToPath(String output) throws IOException{
    		Path path=new Path(output);
    		Configuration conf=new Configuration();
    		conf.set("mapred.job.tracker", "ubuntu:9001");
    		FileSystem fs=FileSystem.get(path.toUri(),conf);

    		SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, path, Text.class, IntWritable.class);
    		int i = 0;
    		// Tracks whether the body failed: a close() error is only swallowed when
    		// another exception is already propagating, so a failed final flush of
    		// the written data is never silently reported as success.
    		boolean threw = true;
    		try {
    			for (Pair<Writable,Writable> record : getInputData()) {
    				Writable key = record.getFirst();
    				dictWriter.append(key, new IntWritable(i++));
    			}
    			threw = false;
    		} finally {
    			Closeables.close(dictWriter, threw);
    		}
    	}

    	/**
    	 * Builds the fixed sample input: seven (term, count) pairs "a1".."a7".
    	 *
    	 * @return a new mutable list of the sample records
    	 */
    	public static List<Pair<Writable,Writable> > getInputData(){
    		List<Pair<Writable,Writable> > records= Lists.newArrayList();
    		records.add(new Pair<Writable, Writable>(new Text("a1"),new LongWritable(93)));
    		records.add(new Pair<Writable, Writable>(new Text("a2"),new LongWritable(43)));
    		records.add(new Pair<Writable, Writable>(new Text("a3"),new LongWritable(33)));
    		records.add(new Pair<Writable, Writable>(new Text("a4"),new LongWritable(32)));
    		records.add(new Pair<Writable, Writable>(new Text("a5"),new LongWritable(31)));
    		records.add(new Pair<Writable, Writable>(new Text("a6"),new LongWritable(23)));
    		records.add(new Pair<Writable, Writable>(new Text("a7"),new LongWritable(83)));
    		return records;
    	}
    }
    

    然后在39行(左右也行)设置断点,即可查看dictWriter变量的属性变化。额,我设了断点,但是它的属性太多了,而且好像还用了转码(a用97表示),表示看懂鸭梨山大,所以还没看懂。

    撇开上面的暂时不管,继续往下看,看到这里的调用:

    makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath,
            maxTermDimension[0], sequentialAccess, namedVectors, numReducers);

    这个是第三个Job任务了。这个任务的Mapper直接用的是Hadoop自带的Mapper类(默认把输入原样输出),所以Mapper应该是没有做什么任务的;reducer是TFPartialVectorReducer,额reducer有点复杂,至少我看了一遍看的不是很明白,不知道到底代码是要干嘛的,所以我就想另辟蹊径看怎么才能知道这个reducer是干嘛的,所以。。。

    所以我就直接读出上面Reducer的输出不就可以了,好吧,查看job的详细信息,额,输出的文件夹是/home/mahout/mahout-work-mahout/20news-vectors/partial-vectors-0,但是怎么找不到?额,当我看到下面的代码的时候,才发现,原来被删除了:

    HadoopUtil.delete(conf, partialVectorPaths);

    额,好的。那就只能设置断点了:在DictionaryVectorizer的203行设置断点就ok了,然后直接debug,ok,产生了这个文件,编写下面的代码进行读取:

    package mahout.fansy.test.bayes.read;
    
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Writable;
    import org.apache.mahout.common.iterator.sequencefile.PathFilters;
    import org.apache.mahout.common.iterator.sequencefile.PathType;
    import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
    import org.apache.mahout.math.VectorWritable;
    
    /**
     * Debug helper that loads the values of a partial-vectors SequenceFile into
     * memory so they can be inspected in a debugger.
     */
    public class ReadPartialVectors0 {

    	/**
    	 * Reads the hard-coded partial-vectors part file; the result is discarded.
    	 *
    	 * @param args unused
    	 */
    	public static void main(String[] args) {
    		Configuration conf = new Configuration();
    		conf.set("mapred.job.tracker", "ubuntu:9001");
    		// Alternative (backup) location:
    		// hdfs://ubuntu:9000/home/mahout/mahout-work-mahout_bak/partial-vectors-0
    		String path="hdfs://ubuntu:9000/home/mahout/mahout-work-mahout0/20news-vectors/partial-vectors-0/part-r-00000";

    		getValue(path, conf);
    	}

    	/**
    	 * Reads every value of the sequence file(s) at {@code path} into a list.
    	 *
    	 * @param path sequence file location
    	 * @param conf Hadoop configuration used for reading
    	 * @return the values read, in iteration order
    	 * @throws IllegalStateException if a value's class is not exactly
    	 *         {@code VectorWritable}
    	 */
    	public static List<VectorWritable> getValue(String path,Configuration conf){
    		List<VectorWritable> vectors = new ArrayList<VectorWritable>();
    		SequenceFileDirValueIterable<Writable> values = new SequenceFileDirValueIterable<Writable>(
    				new Path(path), PathType.LIST, PathFilters.partFilter(), conf);
    		for (Writable current : values) {
    			// Exact class match on purpose: anything else is rejected.
    			if (!current.getClass().equals(VectorWritable.class)) {
    				throw new IllegalStateException("Bad value class: " + current.getClass());
    			}
    			vectors.add((VectorWritable) current);
    		}
    		return vectors;
    	}
    }
    
    

    首先说下这个Job的输入是input-folder: /home/mahout/mahout-work-mahout/20news-vectors/tokenized-documents, dictionary-file: /home/mahout/mahout-work-mahout/20news-vectors/dictionary.file-0,设置断点进行第一个value的读取,读取到的VectorWritable有三个属性分别是:conf、vector、writesLaxPrecision,重点关注vector属性,点开这个属性,可以看到这个属性还包含两个属性:delegate、name,分别点击可以看到下面的属性值:

    delegate:

    {2860:1.0,77227:1.0,3891:1.0,6907:1.0,93219:1.0,8840:1.0,11880:1.0,52668:1.0,14985:1.0,61480:1.0,17162:1.0,17967:1.0,74642:1.0,20094:1.0,21897:1.0,23545:1.0,8768:1.0,26088:1.0,89680:2.0,27788:1.0,30497:2.0,32005:1.0,34520:1.0,24587:1.0,36683:1.0,24585:1.0,37499:1.0,39074:1.0,41504:1.0,43181:1.0,15782:1.0,44299:1.0,80863:1.0,47408:1.0,48633:1.0,50212:2.0,52684:1.0,53133:2.0,56028:1.0,57362:1.0,72034:1.0,59242:1.0,60435:1.0,62310:1.0,63329:3.0,65367:1.0,87875:3.0,67166:1.0,80837:1.0,68924:1.0,31669:1.0,70770:2.0,12231:1.0,91347:2.0,75330:1.0,68516:1.0,75991:2.0,78034:1.0,79870:1.0,56197:1.0,81770:1.0,19256:4.0,83383:1.0,86089:1.0,26422:1.0,87772:1.0,61466:1.0,42221:1.0,31560:1.0,86088:1.0,36835:3.0,12207:1.0,36832:6.0,10574:1.0,43931:1.0,10643:1.0,79071:1.0,15710:1.0,15709:1.0,52646:1.0,15705:1.0,52744:1.0,28015:1.0,10662:1.0,61561:1.0,7929:1.0,68463:1.0,64943:4.0,36948:1.0,63178:1.0,71972:1.0,80760:1.0,31687:1.0,1600:2.0,7182:1.0,80749:1.0,89671:1.0,13903:1.0,36961:1.0,71947:3.0,13899:2.0,17414:1.0,73700:2.0,71940:6.0,86011:1.0,13890:1.0,29719:4.0,8610:1.0,20916:1.0,31095:1.0,63129:1.0,87754:1.0,21259:13.0,15291:1.0,33214:2.0,89716:2.0,31450:3.0,70143:2.0,80693:1.0,41993:3.0,57822:1.0,65162:1.0,27916:1.0,68370:8.0,47257:1.0,70123:1.0,47255:1.0,85949:1.0,71871:1.0,45485:1.0,31411:1.0,68345:2.0,31405:1.0,61306:1.0,6776:1.0,36678:1.0,77482:1.0,43711:2.0,75372:1.0,80645:1.0,82667:2.0,19077:1.0,15558:2.0,31388:1.0,61285:2.0,31381:1.0,89423:1.0,38411:1.0,48963:3.0,91177:3.0,19054:1.0,45438:1.0,31365:1.0,40154:1.0,43668:8.0,91160:1.0,91159:1.0,68290:1.0,66529:3.0,45419:1.0,45418:5.0,91150:6.0,47068:1.0,59485:1.0,52448:1.0,55965:3.0,13748:2.0,8466:1.0,38944:2.0,13737:1.0,66504:1.0,31929:1.0,68261:1.0,11972:1.0,15487:1.0,19004:1.0,1413:1.0,33073:1.0,54175:4.0,55932:4.0,55931:1.0,64725:3.0,85832:1.0,27784:2.0,38971:1.0,66479:1.0,61200:1.0,31956:1.0,10722:1.0,36563:1.0,82862:2.0,8414:1.0,18961:1.0,1369:1.0,18958:2.0,68843:5.0,41467:1.0,76989:2.0,54813:2.0,82894:2.0,59393:1.
0,51309:1.0,19722:1.0,68863:1.0,64664:2.0,17168:5.0,6613:1.0,92802:1.0,25959:1.0,70000:1.0,11885:1.0,61132:1.0,92793:1.0,38263:1.0,78719:1.0,91031:1.0,21501:3.0,45290:1.0,92781:2.0,46077:2.0,84688:1.0,60120:1.0,67142:1.0,27691:3.0,40003:1.0,82217:2.0,43517:1.0,43516:1.0,71659:1.0,27682:1.0,38234:1.0,6569:1.0,20639:2.0,6562:1.0,40840:1.0,17112:2.0,10075:1.0,90987:4.0,71636:1.0,11828:1.0,61078:1.0,78667:1.0,90978:1.0,34687:1.0,57553:1.0,22366:1.0,25883:1.0,8292:2.0,31158:1.0,27639:1.0,73371:2.0,38189:1.0,90958:2.0,40876:3.0,22355:1.0,24112:1.0,48729:1.0,20583:1.0,25859:2.0,90939:1.0,43445:2.0,41413:1.0,59266:3.0,41672:1.0,75090:1.0,11080:1.0,59248:1.0,66281:1.0,80352:2.0,39893:1.0,38131:1.0,36371:1.0,38128:3.0,2947:1.0,2946:1.0,2944:1.0,80339:2.0,78577:3.0,11114:1.0,27559:1.0,45148:1.0,68012:1.0,41626:1.0,63772:1.0,83839:2.0,90874:1.0,55693:1.0,90870:1.0,90869:1.0,6436:1.0,6434:1.0,36336:1.0,78551:3.0,89104:1.0,34574:1.0,2910:1.0,74324:2.0,48637:1.0,36323:1.0,62706:2.0,55669:1.0,38077:1.0,34556:1.0,31036:1.0,34553:3.0,39829:2.0,9924:1.0,46861:2.0,53896:1.0,22232:3.0,46857:2.0,46856:3.0,45096:1.0,66201:1.0,53887:1.0,76751:1.0,32773:2.0,9904:1.0,85538:2.0,64425:1.0,5924:1.0,55628:1.0,11652:1.0,55623:1.0,66175:1.0,90173:1.0,71450:1.0,11638:1.0,76720:3.0,43295:2.0,6353:1.0,30978:2.0,16905:1.0,39297:1.0,85501:1.0,23935:1.0,48560:1.0,67906:2.0,53833:1.0,55589:1.0,53829:1.0,23923:1.0,50303:2.0,67892:1.0,86705:1.0,31087:1.0,27432:2.0,16875:1.0,3870:1.0,51612:1.0,32701:1.0,60844:2.0,64055:1.0,60842:1.0,67876:1.0,6309:1.0,67873:3.0,41486:1.0,17926:1.0,62589:2.0,44992:2.0,46746:1.0,66094:1.0,80165:1.0,87199:1.0,67847:2.0,43220:1.0,67845:1.0,90710:4.0,53769:1.0,89938:2.0,87182:1.0,41445:15.0,66070:1.0,87177:4.0,43199:1.0,56938:1.0,87171:3.0,30881:1.0,7803:1.0,78370:3.0,67811:1.0,67810:2.0,48459:1.0,44940:1.0,85393:1.0,16791:2.0,957:1.0,17085:1.0,83627:1.0,27200:1.0,36132:1.0,90652:1.0,63999:2.0,41396:1.0,9732:1.0,24119:1.0,37874:1.0,39435:2.0,27317:1.0,6207:1.0,30832:1.0,83600:1.
0,14915:1.0,55453:1.0,46656:1.0,43136:1.0,43134:1.0,71277:1.0,11469:2.0,73032:2.0,51742:2.0,74789:1.0,78304:3.0,88857:2.0,60706:2.0,64041:1.0,6127:1.0,34317:1.0,23679:1.0,48387:5.0,55422:1.0,44744:1.0,20233:1.0,34301:1.0,20227:1.0,58923:1.0,20223:8.0,87063:1.0,69333:4.0,83543:1.0,83538:1.0,39560:14.0,18449:1.0,83532:2.0,25484:1.0,27241:1.0,80010:1.0,34275:1.0,71212:1.0,34271:2.0,46542:2.0,32503:1.0,14917:2.0,53613:1.0,71118:1.0,76475:1.0,6195:1.0,69428:1.0,27209:1.0,62388:1.0,23689:1.0,27205:1.0,818:1.0,83488:1.0,14886:1.0,62371:2.0,43028:1.0,39508:1.0,44784:2.0,37747:2.0,55336:1.0,55335:2.0,14876:4.0,62365:3.0,48290:1.0,71150:1.0,53559:1.0,50533:1.0,72928:1.0,16615:1.0,27708:1.0,21889:1.0,16608:1.0,44750:1.0,20123:1.0,23801:1.0,25396:2.0,83442:1.0,25394:4.0,16598:2.0,23810:1.0,18546:1.0,25385:4.0,30661:1.0,81667:6.0,41372:1.0,37690:1.0,71107:1.0,76383:2.0,25371:1.0,8039:3.0,2496:1.0,39434:1.0,55429:1.0,6011:1.0,67716:1.0,51743:1.0,2490:1.0,60536:1.0,86920:1.0,27109:1.0,30626:1.0,23589:2.0,1038:5.0,79872:1.0,49968:3.0,14787:1.0,76350:2.0,20061:1.0,34132:2.0,27095:3.0,53479:1.0,67550:1.0,39404:2.0,2464:1.0,71064:1.0,71063:1.0,85132:1.0,18286:2.0,67536:1.0,66001:6.0,76329:1.0,71049:1.0,69516:1.0,53722:2.0,85118:1.0,64254:3.0,82194:1.0,27066:1.0,62244:1.0,64002:1.0,66017:1.0,64637:1.0,59115:1.0,83347:1.0,42889:1.0,71663:1.0,46405:1.0,8115:2.0,36548:1.0,23525:1.0,62222:2.0,39184:2.0,16484:3.0,1109:1.0,30550:1.0,37583:2.0,19991:2.0,32303:1.0,46372:1.0,67116:1.0,11186:1.0,60437:1.0,83302:1.0,11181:1.0,37989:1.0,72742:1.0,28763:1.0,28762:1.0,49868:2.0,16446:1.0,28758:1.0,39310:1.0,90320:1.0,83283:1.0,18199:1.0,72726:2.0,16436:1.0,83277:1.0,74866:2.0,30504:2.0,55565:3.0,83271:3.0,90303:1.0,72712:1.0,90299:1.0,14661:1.0,81901:1.0,7621:1.0,8195:2.0,53826:1.0,90289:1.0,64360:1.0,23996:1.0,25200:1.0,48065:1.0,80163:1.0,37508:1.0,52087:1.0,9360:1.0,51574:1.0,18747:1.0,62126:3.0,48053:1.0,28703:1.0,18146:2.0,5871:1.0,87202:2.0,26935:1.0,90257:3.0,5824:1.0,19891:1.0,32202:1.0,4099
4:1.0,76173:2.0,72652:1.0,32191:1.0,36615:3.0,37466:2.0,27558:1.0,72639:1.0,58564:2.0,28660:2.0,53284:1.0,24065:1.0,69109:1.0,21615:2.0,90214:1.0,53905:1.0,25827:1.0,19849:1.0,88449:1.0,84930:1.0,49749:1.0,64446:1.0,59182:1.0,44466:6.0,39187:1.0,53257:1.0,88436:2.0,86675:3.0,9278:1.0,33902:1.0,55684:1.0,84910:1.0,35657:1.0,17078:4.0,76106:2.0,11823:2.0,44439:5.0,90166:10.0,19805:1.0,11834:2.0,19803:1.0,86643:1.0,90160:1.0,19798:3.0,30351:1.0,81360:1.0,52209:1.0,33864:1.0,18032:1.0,18031:1.0,76077:1.0,54220:1.0,49689:1.0,30336:1.0,19980:1.0,90139:2.0,70788:1.0,70787:1.0,27659:1.0,70784:1.0,76059:1.0,35599:1.0,81332:1.0,14489:4.0,10970:1.0,6610:1.0,14484:1.0,83206:2.0,88360:1.0,81323:1.0,23274:1.0,46992:1.0,90105:3.0,72514:1.0,27690:1.0,60199:1.0,17981:1.0,40847:4.0,90875:1.0,28527:1.0,53151:1.0,17970:1.0,47870:2.0,76013:1.0,27709:1.0,7409:1.0,3887:1.0,55798:1.0,60169:3.0,3878:1.0,3875:1.0,19704:1.0,3872:1.0,16184:1.0,81266:1.0,9142:1.0,3864:1.0,17935:1.0,7379:1.0,65425:1.0,33008:1.0,3855:1.0,72455:5.0,17925:2.0,40791:1.0,31993:2.0,19674:1.0,86515:1.0,75960:1.0,75958:3.0,9112:1.0,88264:2.0,67155:1.0,82985:1.0,88261:1.0,70670:1.0,56597:1.0,70664:1.0,51314:1.0,44277:1.0,21409:1.0,82973:1.0,90008:1.0,53068:1.0,88246:1.0,93522:1.0,63618:3.0,68894:1.0,88242:3.0,65370:1.0,19634:1.0,75921:1.0,31945:1.0,63605:1.0,51291:1.0,56566:1.0,15524:2.0,80460:2.0,63597:3.0,58317:2.0,19617:1.0,24893:1.0,24890:2.0,89971:1.0,17851:1.0,17850:2.0,17848:10.0,17847:11.0,30159:4.0,36604:1.0,31344:3.0,54778:2.0,12560:1.0,2005:2.0,28389:1.0,31906:3.0,79778:1.0,79397:1.0,67082:1.0,16068:1.0,70595:1.0,47726:1.0,38383:1.0,75866:1.0,92791:1.0,75247:1.0,31883:1.0,15699:3.0,51222:1.0,52979:1.0,19556:1.0,9000:2.0,3722:1.0,66498:2.0,45928:3.0,38887:1.0,45922:1.0,38428:6.0,3701:1.0,33601:1.0,36680:1.0,70538:1.0,85079:1.0,16006:1.0,54700:1.0,63016:1.0,24787:1.0,66534:1.0,86348:1.0,63028:1.0,40611:1.0,33574:1.0,10706:1.0,65234:1.0,3668:1.0,58194:1.0,81060:1.0,21251:4.0,38840:1.0,86331:3.0,44110:1.0,86325:1.
0,81047:1.0,63450:1.0,70080:1.0,56411:3.0,35301:1.0,63444:2.0,82791:4.0,15947:1.0,71847:1.0,60529:1.0,68709:1.0,56198:1.0,91161:2.0,15933:3.0,77496:1.0,15929:5.0,17687:3.0,15926:1.0,71981:1.0,17683:2.0,65172:1.0,37025:1.0,58130:1.0,17671:1.0,72198:1.0,65161:1.0,65159:3.0,13970:1.0,70432:1.0,65154:1.0,84502:1.0,14139:1.0,56351:1.0,72178:1.0,86248:2.0,51067:1.0,47543:1.0,61439:4.0,71912:1.0,19394:1.0,64955:1.0,45773:1.0,14109:1.0,26421:1.0,33351:1.0,15792:2.0,41254:1.0,77422:1.0,17615:1.0,42240:1.0,68624:1.0,65105:1.0,15779:1.0,66858:1.0,7050:1.0,8769:1.0,70195:2.0,3526:1.0,7043:1.0,10560:1.0,86196:1.0,77400:1.0,0:6.0}

    name:

    /alt.atheism/49960

    看到这两个属性值后,大概可以猜测这个Job的作用是:把输入tokenized-documents中每个文件的单词先转换为dictionary中对应的数值(编号),然后在后面加上该单词在文件中出现的次数,这样即可得到delegate的值。当然这个也只是猜测,不过估计八九不离十吧。

    再说吧。。。

    分享,快乐,成长


    转载请注明出处:http://blog.csdn.net/fansy1990 


  • 相关阅读:
    POJ 1659 Frogs' Neighborhood
    zoj 2913 Bus Pass(BFS)
    ZOJ 1008 Gnome Tetravex(DFS)
    POJ 1562 Oil Deposits (DFS)
    zoj 2165 Red and Black (DFs)poj 1979
    hdu 3954 Level up
    sgu 249 Matrix
    hdu 4417 Super Mario
    SPOJ (BNUOJ) LCM Sum
    hdu 2665 Kth number 划分树
  • 原文地址:https://www.cnblogs.com/suncoolcat/p/3292111.html
Copyright © 2011-2022 走看看