zoukankan      html  css  js  c++  java
  • nnet3bin/nnet3-xvector-compute.cc

    将特征在xvector神经网络模型中前向传播,并写出输出向量。我们将说话人识别的特定神经网络结构的输出向量或embedding称之为"Xvector"。该网络结构包括:帧级别的多个前馈层、在帧级别表示之上进行聚合的统计池化层,以及段级别的附加层。通常在统计池化层之后的输出层提取xvector。默认情况下,每个语句生成一个xvector。根据需要,也可以从多个chunk中分别提取xvector并求平均,以生成单个矢量。

       

    Usage: nnet3-xvector-compute [options] <raw-nnet-in> <features-rspecifier> <vector-wspecifier>

    e.g.: nnet3-xvector-compute final.raw scp:feats.scp ark:nnet_prediction.ark

       

    一个语音特征chunk,生成一个xvector

    // Runs the network forward on one chunk of features and stores the
    // resulting embedding (one x-vector per chunk) in *xvector.
    //
    //   features  feature matrix for this chunk; all of its rows are requested
    //             as the "input" of the computation.
    //   nnet      network to evaluate; it is not updated here.
    //   compiler  used to compile the computation request built below.
    //   xvector   output; resized to the number of output columns and filled
    //             with row 0 of the network's "output".
    static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
                                   const Nnet &nnet,
                                   CachingOptimizingCompiler *compiler,
                                   Vector<BaseFloat> *xvector) {
      ComputationRequest request;
      request.need_model_derivative = false;
      request.store_component_stats = false;
      request.inputs.push_back(
          IoSpecification("input", 0, features.NumRows()));

      IoSpecification output_spec;
      output_spec.name = "output";
      output_spec.has_deriv = false;
      // Request exactly one output index, so that a chunk (segment) yields a
      // single result: the x-vector.
      output_spec.indexes.resize(1);
      request.outputs.resize(1);
      request.outputs[0].Swap(&output_spec);

      // Compile() already returns a temporary; wrapping it in std::move() is
      // redundant, so the result is used to initialize the pointer directly.
      std::shared_ptr<const NnetComputation> computation(
          compiler->Compile(request));
      Nnet *nnet_to_update = nullptr;  // we're not doing any update.
      NnetComputer computer(NnetComputeOptions(), *computation,
                            nnet, nnet_to_update);

      CuMatrix<BaseFloat> input_feats_cu(features);
      computer.AcceptInput("input", &input_feats_cu);
      computer.Run();

      // Only one output index was requested, so the output is expected to be a
      // matrix with a single row.
      CuMatrix<BaseFloat> cu_output;
      computer.GetOutputDestructive("output", &cu_output);

      // The first row of the output matrix is the x-vector.
      xvector->Resize(cu_output.NumCols());
      xvector->CopyFromVec(cu_output.Row(0));
    }

       

    ParseOptions po(usage);

    Timer timer;

       

    NnetSimpleComputationOptions opts;

    CachingOptimizingCompilerOptions compiler_config;

       

    opts.acoustic_scale = 1.0; // by default do no scaling in this recipe.

       

    std::string use_gpu = "no";

    int32 chunk_size = -1,

    min_chunk_size = 100;

    //若帧组不足一个chunk,则对input进行左右padding

    bool pad_input = true;

       

    opts.Register(&po);

    compiler_config.Register(&po);

       

    po.Register("use-gpu", &use_gpu,

    "yes|no|optional|wait, only has effect if compiled with CUDA");

    po.Register("chunk-size", &chunk_size,

    "If set, extracts xectors from specified chunk-size, and averages. "

    "If not set, extracts an xvector from all available features.");

    po.Register("min-chunk-size", &min_chunk_size,

    "Minimum chunk-size allowed when extracting xvectors.");

    po.Register("pad-input", &pad_input, "If true, duplicate the first and "

    "last frames of the input features as required to equal min-chunk-size.");

       

    po.Read(argc, argv);

       

    if (po.NumArgs() != 3) {

    po.PrintUsage();

    exit(1);

    }

       

    #if HAVE_CUDA==1

    CuDevice::Instantiate().SelectGpuId(use_gpu);

    #endif

       

    std::string nnet_rxfilename = po.GetArg(1),

    feature_rspecifier = po.GetArg(2),

    vector_wspecifier = po.GetArg(3);

       

    Nnet nnet;

    ReadKaldiObject(nnet_rxfilename, &nnet);

    SetBatchnormTestMode(true, &nnet);

    SetDropoutTestMode(true, &nnet);

    CollapseModel(CollapseModelConfig(), &nnet);

       

    CachingOptimizingCompiler compiler(nnet, opts.optimize_config, compiler_config);

       

    BaseFloatVectorWriter vector_writer(vector_wspecifier);

       

    int32 num_success = 0, num_fail = 0;

    int64 frame_count = 0;

    int32 xvector_dim = nnet.OutputDim("output");

       

    SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);


    // For every utterance: split the features into chunks, extract one
    // x-vector per chunk and write their frame-weighted average.
    for (; !feature_reader.Done(); feature_reader.Next()) {
      std::string utt = feature_reader.Key();
      const Matrix<BaseFloat> &features (feature_reader.Value());
      if (features.NumRows() == 0) {
        KALDI_WARN << "Zero-length utterance: " << utt;
        num_fail++;
        continue;
      }
      int32 num_rows = features.NumRows(),
          feat_dim = features.NumCols(),
          this_chunk_size = chunk_size;
      if (!pad_input && num_rows < min_chunk_size) {
        KALDI_WARN << "Minimum chunk size of " << min_chunk_size
                   << " is greater than the number of rows "
                   << "in utterance: " << utt;
        num_fail++;
        continue;
      } else if (num_rows < chunk_size) {
        KALDI_LOG << "Chunk size of " << chunk_size << " is greater than "
                  << "the number of rows in utterance: " << utt
                  << ", using chunk size of " << num_rows;
        this_chunk_size = num_rows;
      } else if (chunk_size <= 0) {
        // chunk-size not set (default -1): use the whole utterance as a single
        // chunk.  Any other non-positive value is treated the same way, since
        // it would otherwise lead to a division by zero or to zero chunks.
        this_chunk_size = num_rows;
      }
      // Number of chunks needed to cover all rows; this is 1 when the whole
      // utterance is used as a single chunk.
      int32 num_chunks = ceil(
          num_rows / static_cast<BaseFloat>(this_chunk_size));
      Vector<BaseFloat> xvector_avg(xvector_dim, kSetZero);
      BaseFloat tot_weight = 0.0;

      // Iterate over the feature chunks.
      for (int32 chunk_indx = 0; chunk_indx < num_chunks; chunk_indx++) {
        // Near the end of the input the remaining frames may not fill a whole
        // chunk; 'offset' is the number of frames actually in this chunk.
        int32 offset = std::min(
            this_chunk_size, num_rows - chunk_indx * this_chunk_size);
        if (!pad_input && offset < min_chunk_size)
          continue;
        SubMatrix<BaseFloat> sub_features(
            features, chunk_indx * this_chunk_size, offset, 0, feat_dim);
        Vector<BaseFloat> xvector;
        tot_weight += offset;

        // Pad input if the offset is less than the minimum chunk size
        if (pad_input && offset < min_chunk_size) {
          Matrix<BaseFloat> padded_features(min_chunk_size, feat_dim);
          int32 left_context = (min_chunk_size - offset) / 2;
          int32 right_context = min_chunk_size - offset - left_context;
          // Replicate the first frame on the left ...
          for (int32 i = 0; i < left_context; i++) {
            padded_features.Row(i).CopyFromVec(sub_features.Row(0));
          }
          // ... and the last frame on the right.
          for (int32 i = 0; i < right_context; i++) {
            padded_features.Row(min_chunk_size - i - 1).CopyFromVec(sub_features.Row(offset - 1));
          }
          padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(sub_features);
          // One chunk produces one x-vector.
          RunNnetComputation(padded_features, nnet, &compiler, &xvector);
        } else {
          RunNnetComputation(sub_features, nnet, &compiler, &xvector);
        }
        // Accumulate the x-vectors of all chunks, weighted by the number of
        // frames in each chunk.
        xvector_avg.AddVec(offset, xvector);
      }

      // With pad-input disabled and a chunk size below min-chunk-size every
      // chunk is skipped; dividing by a zero weight would then write a
      // non-finite vector, so treat the utterance as a failure instead.
      if (tot_weight == 0.0) {
        KALDI_WARN << "No xvector extracted for utterance " << utt
                   << ": every chunk is shorter than the minimum chunk size of "
                   << min_chunk_size;
        num_fail++;
        continue;
      }

      // Average of the x-vectors over all chunks.
      xvector_avg.Scale(1.0 / tot_weight);
      vector_writer.Write(utt, xvector_avg);

      frame_count += features.NumRows();
      num_success++;
    }

      

     

  • 相关阅读:
    PAT乙级真题1004. 成绩排名 (20)(解题)
    PAT乙级真题1003. 我要通过!(20)(解题)
    PAT乙级真题1002. 写出这个数 (20)(解题)
    PAT乙级真题1001. 害死人不偿命的(3n+1)猜想 (15)(解题)
    2015-03-06——ajax基础
    2015-03-06——正则表达式基础
    2015-02-09——js笔记
    2015-02-08——js笔记
    2015-02-07——js笔记
    2014-10-28——iframe多层嵌套时获取元素总结
  • 原文地址:https://www.cnblogs.com/JarvanWang/p/10146015.html
Copyright © 2011-2022 走看看