zoukankan      html  css  js  c++  java
  • 记一次深刻的教训-----将mat数据转化为SequenceFile

    深刻的体会就是:“java.lang.NullPointerException”(空指针异常)可能是由于数组的部分元素未被初始化引起的。

    1)使用jmatio将mat数据转化为SequenceFile形式的数据,代码如下:

     1 /**
     2  * Created with IntelliJ IDEA.
     3  * User: hadoop
     4  * Date: 16-3-6
     5  * Time: 上午10:56
     6  * To change this template use File | Settings | File Templates.
     7  */
     8 import com.jmatio.io.MatFileReader;
     9 import com.jmatio.types.*;
    10 import java.io.IOException;
    11 import org.apache.hadoop.conf.Configuration;
    12 import org.apache.hadoop.fs.FileSystem;
    13 import org.apache.hadoop.fs.Path;
    14 import org.apache.hadoop.io.*;
    15 import java.net.URI;
    16 import org.apache.hadoop.mapreduce.*;
    17 public class mat2Seq { // Converts MATLAB .mat matrices (read with jmatio) into Hadoop SequenceFiles.
    18 public static void main(String[] args) throws IOException { // Converts the five 100_100 sample matrices.
    19     writeMat2Seq("data/100_100/F100.mat","SeqOutput/F");
    20     writeMat2Seq("data/100_100/b100.mat","SeqOutput/b");
    21     writeMat2Seq("data/100_100/d100.mat","SeqOutput/d");
    22     writeMat2Seq("data/100_100/s100.mat","SeqOutput/s");
    23     writeMat2Seq("data/100_100/u100.mat","SeqOutput/u");
    24 }
    25 
    26 public static void writeMat2Seq(String matPath,String SeqOutput) throws IOException { // Writes variable "a" of matPath to SeqOutput as (IntWritable index, DoubleArrayWritable vector) records.
    27         MatFileReader reader=new MatFileReader(matPath);
    28         MLArray mlArray=reader.getMLArray("a"); // the matrix is looked up under the fixed variable name "a"
    29         MLDouble doubleValue=(MLDouble)mlArray;
    30         double[][] matrix=doubleValue.getArray();
    31         Configuration conf =new Configuration();
    32         FileSystem fs=FileSystem.get(URI.create(SeqOutput),conf);
    33         IntWritable key=new IntWritable();
    34         DoubleArrayWritable value=new DoubleArrayWritable();
    35         SequenceFile.Writer writer=null;
    36         try {
    37             writer=SequenceFile.createWriter(fs,conf,new Path(SeqOutput),key.getClass(),
    38                     value.getClass());
    39             if (matPath.endsWith("F100.mat")){    // left matrix F: each row becomes one SequenceFile record
    40                 DoubleWritable[] rowVector=new DoubleWritable[matrix.length]; // NOTE(review): sized by the row count, but the inner loop fills matrix[0].length entries -- only consistent for square matrices
    41                 for (int i=0;i<matrix.length;++i){
    42                     for (int j=0;j<matrix[0].length;++j){
    43                        rowVector[j]=new DoubleWritable(0);
    44                        rowVector[j].set(matrix[i][j]);
    45                     }
    46                     value.set(rowVector);
    47                     key.set(i); // key is the row index
    48                     writer.append(key,value);
    49                 }
    50                 writer.close();
    51             }
    52             else{          // every other (right-hand) matrix: each column becomes one SequenceFile record
    53                 DoubleWritable[] columnVector=new DoubleWritable[matrix[0].length]; // NOTE(review): sized by the column count, but the inner loop fills only matrix.length entries; with more columns than rows the tail stays null, with more rows than columns the index runs past the end
    54                 for (int i=0;i<matrix[0].length;++i){
    55                     for (int j=0;j<matrix.length;++j){
    56                         columnVector[j]=new DoubleWritable(0);
    57                         columnVector[j].set(matrix[j][i]);
    58                     }
    59                     value.set(columnVector);
    60                     key.set(i); // key is the column index
    61                     writer.append(key,value);
    62                 }
    63                 writer.close();
    64 
    65             }
    66         }
    67         finally { // NOTE(review): empty -- the writer is closed only on the two success paths above
    68         }
    69     System.out.println(matPath+"write done!");
    70     }
    71 }
    72 /**
    73  * {@link ArrayWritable} whose element class is fixed to {@link DoubleWritable}
    74  * by the no-argument constructor.
    75  */
    76 class DoubleArrayWritable extends ArrayWritable {
    77     public DoubleArrayWritable() {
    78         super(DoubleWritable.class);
    79     }
    80 
    81     /**
    82      * Renders the elements as a comma-separated list, e.g. {@code 0.344,0.435}.
    83      *
    84      * @return the joined element values, or an empty string when the array is
    85      *         unset or has no elements
    86      */
    87     @Override
    88     public String toString() {
    89         Writable[] elements = get();
    90         // get() may return null before set() has been called, and an empty array
    91         // would make deleteCharAt(length - 1) fail on an empty builder.
    92         if (elements == null || elements.length == 0) {
    93             return "";
    94         }
    95         StringBuilder sb = new StringBuilder();
    96         for (Writable element : elements) {
    97             if (sb.length() > 0) {
    98                 sb.append(',');
    99             }
   100             sb.append(((DoubleWritable) element).get());
   101         }
   102         return sb.toString();
   103     }
   104 }

    以上使用的.mat文件,程序都可以好好的运行。但是当把文件换成一个B1k2k,也就是一个1000*2000的矩阵文件时,就报空指针的异常,“java.lang.NullPointerException”,具体如下:

     

     提示是在ArrayWritable.write()方法中出现空指针的异常,就开始怀疑是ArrayWritable这个类没写好(也就是怀疑人家有bug,然后就下了hadoop2.6.4,还是不行),然后一路追查,到最后ArrayWritable的write()方法最终调用了BufferedOutputStream.write(),然后就开始怀疑是open-jdk不兼容,然后就重装了sun JDK。还是不行,然后就把java的io包里的.java源码拷贝到工程里,想着单步调试到BufferedOutputStream.write(),看看究竟发生了什么,怎奈jmatio需要用到io包,我又没法重新编译,所以就想先把.mat转化为.txt文件,但是呢,没成功,因为虚拟机磁盘空间不够了,没法了,想想是不是机子环境的问题呢(哈哈哈,想象力太好),就传给谷总试试看能不能运行,谷总说你确定不是算法问题?答:不该呀。呵呵呵,结果谷总发来了两张图片,如下:

     

     

    至此,终于找到了原因,就是代码有问题。columnVector数组是用来存储矩阵的一列的,数组长度应为矩阵的行数,但是原先的代码里却将数组长度定义为矩阵的列数。

     DoubleWritable[] columnVector=new DoubleWritable[matrix[0].length];
     for (int i=0;i<matrix[0].length;++i){
     for (int j=0;j<matrix.length;++j){
           columnVector[j]=new DoubleWritable(0);
           columnVector[j].set(matrix[j][i]);
    }
    value.set(columnVector);
    key.set(i);
    writer.append(key,value);
    }

    这就解释了为什么1k*1100的矩阵转化时会提示空指针异常:按照上述对columnVector的定义,这个列向量数组的长度是1100,但是在接下来给这个向量赋值时,循环次数是由矩阵的行数来控制的,也就是说只对columnVector的前1000个元素赋了值,剩下的100个元素仍是空(如果是基本类型,如int、double,JVM会将其默认初始化为0,不过DoubleWritable是引用类型,默认值是“null”),所以在接下来使用writer.append(key,value)调用输出流写出的时候,自然会抛出“java.lang.NullPointerException”异常。而之前100*100这样的方阵行数与列数相等,所以这个问题一直没有暴露出来。经过修改的代码如下:

     1 /**
     2  * Created with IntelliJ IDEA.
     3  * User: hadoop
     4  * Date: 16-3-6
     5  * Time: 上午10:56
     6  * To change this template use File | Settings | File Templates.
     7  */
     8 //package java.io;
     9 import com.jmatio.io.MatFileReader;
    10 import com.jmatio.types.*;
    11 import java.io.IOException;
    12 import org.apache.hadoop.conf.Configuration;
    13 import org.apache.hadoop.fs.FileSystem;
    14 import org.apache.hadoop.fs.Path;
    15 import org.apache.hadoop.io.*;
    16 public class mat2Seq {
    17     public static void main(String[] args) throws IOException {
    18         writeMat2Seq("data/1k_1k/F1k1k.mat","SeqOutput/F1k1k");
    19         writeMat2Seq("data/100_100/b100.mat","SeqOutput/b100");
    20         writeMat2Seq("data/1k1100/mat1k1100.mat","SeqOutput/test1k1100");
    21         writeMat2Seq("data/B1k2w.mat","SeqOutput/1k2w");
    22 
    23         //writeMat2Seq("data/1k_2w/B1k2w.mat","SeqOutput5/B1k2w");
    24     }
    25 
    26     public static void writeMat2Seq(String matPath,String SeqOutput) throws IOException {
    27         MatFileReader reader=new MatFileReader(matPath);
    28         MLArray mlArray=reader.getMLArray("a");
    29         MLDouble doubleValue=(MLDouble)mlArray;
    30         double[][] matrix=doubleValue.getArray();
    31         Configuration conf =new Configuration();
    32         //FileSystem fs=FileSystem.get(URI.create(SeqOutput),conf);
    33         FileSystem fs=FileSystem.get(conf);
    34         Path path=new Path(SeqOutput);
    35         //FSDataOutputStream outputStream=fs.create(path);
    36         IntWritable key=new IntWritable();
    37         DoubleArrayWritable value=new DoubleArrayWritable();
    38         SequenceFile.Writer writer=null;
    39         try {
    40             writer=SequenceFile.createWriter(fs,conf,path,key.getClass(),value.getClass());
    41 
    42             // SequenceFile.Writer.Option
    43             if (matPath.endsWith("F1k.mat")){    //左矩阵F依次将行存储到Seq
    44                 DoubleWritable[] rowVector=new DoubleWritable[matrix[0].length];
    45                 for (int i=0;i<matrix.length;++i){
    46                     for (int j=0;j<matrix[0].length;++j){
    47                         rowVector[j]=new DoubleWritable(0);
    48                         rowVector[j].set(matrix[i][j]);
    49                     }
    50                     value.set(rowVector);
    51                     key.set(i);
    52                     writer.append(key,value);
    53                 }
    54                 writer.close();
    55                 //outputStream.close();
    56                 fs.close();
    57             }
    58             else{          //其他右矩阵依次将列存储到Seq中
    59                 //DoubleWritable[] columnVector=new DoubleWritable[matrix[0].length];
    60                 DoubleWritable[] columnVector=new DoubleWritable[matrix.length];
    61                 for (int i=0;i<matrix[0].length;++i){
    62                     for (int j=0;j<matrix.length;++j){
    63                         columnVector[j]=new DoubleWritable(0);
    64                         columnVector[j].set(matrix[j][i]);
    65                     }
    66                     value.set(columnVector);
    67                     key.set(i);
    68                     writer.append(key,value);
    69                 }
    70                 writer.close();
    71                 //outputStream.close();
    72                 fs.close();
    73 
    74             }
    75         }
    76         finally {
    77         }
    78         System.out.println(matPath+"write done!");
    79     }
    80 }
    81 /** An {@link ArrayWritable} whose element class is fixed to {@link DoubleWritable}. */
    82 class DoubleArrayWritable extends ArrayWritable {
    83 
    84     /** Registers {@link DoubleWritable} as the element class with the superclass. */
    85     public DoubleArrayWritable() {
    86         super(DoubleWritable.class);
    87     }
    88 
    89     // toString() is deliberately left un-overridden; the comma-joining version
    90     // below is kept for reference only.
    91     //
    92     // public String toString() {
    93     //     StringBuilder sb = new StringBuilder();
    94     //     for (Writable val : get()) {
    95     //         sb.append(((DoubleWritable) val).get()).append(",");
    96     //     }
    97     //     sb.deleteCharAt(sb.length() - 1);
    98     //     return sb.toString();
    99     // }
   100 }

     另外,把DoubleArrayWritable的toString()方法注释掉是有原因的:如果使用这个新定义的toString()方法,写入SequenceFile中的value形式就是0.344,0.435......这种矩阵形式(使用hadoop fs -text查看),形式一目了然,但是文件写入速度慢(B1k2w文件需要两分钟才可以完成)。如果使用Object提供的toString()方法的话,写入的value形式就是DoubleArrayWritable@34d79f形式,看着不直观,如果要查看value的值还必须使用程序反序列化,但是这种方法写入的内容很少,文件的写入速度很快(B1k2w文件只需要1~2s就可完成)。所以还是不要重写(override)toString()方法。

  • 相关阅读:
    代码管理模型概况
    循环链表
    队列

    链表
    java 2020-10-12T11:22:49.000+0800 字符串转换成正常时间格式
    动态数组
    mysql练习
    复杂度与LeetCode
    记一次带逗号的数字类型处理
  • 原文地址:https://www.cnblogs.com/lz3018/p/5247101.html
Copyright © 2011-2022 走看看