  • MTCNN代码解读


    #include "mtcnn.hpp"
    #include "utils.hpp"
    using namespace std;
    using namespace bmruntime;
    /**
     * Creates the detector and loads the three cascade networks.
     *
     * @param bmodel Model locations indexed by stage: [0] is handed to the P-Net,
     *               [1] to the R-Net and [2] to the O-Net. At least three entries
     *               are required; this is not validated here.
     */
    MTCNN::MTCNN(const vector<string>& bmodel) {
      min_size_ = 40;          // smallest face size to look for; meant to be configured by the caller
      min_pyramid_size_ = 12;  // pyramid building stops once the scaled short side drops below this
      factor_ = 0.5;           // scale multiplier between consecutive pyramid levels
      in_w_ = 1920;            // width of the fixed working canvas used by rescale_image()
      in_h_ = 1080;            // height of the fixed working canvas
      pnet_ = new Net(bmodel[0]);
      rnet_ = new Net(bmodel[1]);
      onet_ = new Net(bmodel[2]);
      // Single float buffer reused as the network input by every stage; sized
      // for one full 3-channel canvas.
      in_data_ = new float[1 * 3 * in_w_ * in_h_];
      ts_ = nullptr;  // profiling stays off until enable_profiling() is called
    }
    /**
     * Releases the three networks and the shared input buffer allocated by the
     * constructor. The profiling sink ts_ is not deleted here.
     */
    MTCNN::~MTCNN() {
      delete pnet_;
      delete rnet_;
      delete onet_;
      delete []in_data_;
    }
    /** Sort predicate: orders face rectangles by descending score. */
    static inline bool compareBBox(const FaceRect &a, const FaceRect &b) {
      return a.score > b.score;
    }
    *	src :w*h=640*480
    *	dst :
    float MTCNN::rescale_image(const cv::Mat &src, cv::Mat *dst) {
      float ratio = 1.0;
      if (((size_t)src.rows == in_h_) && ((size_t)src.cols == in_w_)) {
        ratio = 1.0;
        *dst = src;
        return ratio;
      //ratio = 0.444444 = std::max(1.0 * 480 / 1080, 1.0 * 640 /1920)
      ratio = std::max(1.0 * src.rows / in_h_, 1.0 * src.cols /in_w_);
      //pad_bottom = 0 = 1080 - 480/0.444
      int pad_bottom = in_h_ - src.rows / ratio;
      //pad_right = 480 = 1920 - 640/0.444
      int pad_right = in_w_ - src.cols / ratio;
      //将src resize为(w,h) = (1440,1080) ,这里先将原图等比例缩放,保证不形变
      cv::resize(src, *dst, cv::Size(src.cols / ratio, src.rows / ratio), 0, 0, cv::INTER_NEAREST);
      if (pad_bottom || pad_right) {
        cv::copyMakeBorder(*dst, *dst, 0, pad_bottom, 0, pad_right, cv::BORDER_CONSTANT, cv::Scalar(0));
      return ratio;
    void MTCNN::nms(const std::vector<FaceRect> &proposals,
                        std::vector<FaceRect> &nmsProposals) {
      if (proposals.empty()) {
      std::vector<FaceRect> bboxes = proposals;
      std::sort(bboxes.begin(), bboxes.end(), compareBBox);
      int select_idx = 0;
      int num_bbox = bboxes.size();
      std::vector<int> mask_merged(num_bbox, 0);
      bool all_merged = false;
      while (!all_merged) {
        while (select_idx < num_bbox && 1 == mask_merged[select_idx])
        if (select_idx == num_bbox) {
          all_merged = true;
        mask_merged[select_idx] = 1;
        FaceRect select_bbox = bboxes[select_idx];
        float area1 = (select_bbox.x2 - select_bbox.x1 + 1) *
                      (select_bbox.y2 - select_bbox.y1 + 1);
        for (int i = select_idx; i < num_bbox; ++i) {
          if (mask_merged[i] == 1)
          FaceRect &bbox_i = bboxes[i];
          float x = std::max(select_bbox.x1, bbox_i.x1);
          float y = std::max(select_bbox.y1, bbox_i.y1);
          float w = std::min(select_bbox.x2, bbox_i.x2) - x + 1;
          float h = std::min(select_bbox.y2, bbox_i.y2) - y + 1;
          if (w <= 0 || h <= 0)
          float area2 = (bbox_i.x2 - bbox_i.x1 + 1) * (bbox_i.y2 - bbox_i.y1 + 1);
          float area_intersect = w * h;
          // Union method
          if (area_intersect / (area1 + area2 - area_intersect) > nms_threshold_)
            mask_merged[i] = 1;
    /**
     * Clips every box to the bounds of `image`.
     *
     * @param image    Image whose size defines the valid range
     *                 [0, cols - 1] x [0, rows - 1].
     * @param boxes    Boxes to clip (left untouched).
     * @param paddings Receives one clipped rectangle per input box, in the same
     *                 order. Any previous content is replaced. Only x1/y1/x2/y2
     *                 are filled in.
     */
    void MTCNN::padding(const cv::Mat &image,
                            const std::vector<FaceRect> &boxes,
                            std::vector<FaceRect> &paddings) {
      const float max_x = image.cols - 1;
      const float max_y = image.rows - 1;
      // Clamp both ends of every edge so that a box lying completely outside
      // the image still yields a valid (non-inverted) range for cropping.
      auto clamp_to = [](float v, float lo, float hi) {
        return v < lo ? lo : (v > hi ? hi : v);
      };
      std::vector<FaceRect> clipped;
      for (uint32_t i = 0; i < boxes.size(); i++) {
        FaceRect rect;
        rect.x1 = clamp_to(boxes[i].x1, 0, max_x);
        rect.y1 = clamp_to(boxes[i].y1, 0, max_y);
        rect.x2 = clamp_to(boxes[i].x2, 0, max_x);
        rect.y2 = clamp_to(boxes[i].y2, 0, max_y);
        clipped.push_back(rect);
      }
      // The caller reuses the same output vector between stages, so the result
      // must replace the old content rather than append to it.
      paddings.swap(clipped);
    }
    void MTCNN::bbox2square(std::vector<FaceRect> &bboxes) {
      for (uint32_t i = 0; i < bboxes.size(); ++i) {
        float w = bboxes[i].x2 - bboxes[i].x1 + 1;
        float h = bboxes[i].y2 - bboxes[i].y1 + 1;
        float side = std::max<float>(w, h);
        bboxes[i].x1 += (w - side) * 0.5;
        bboxes[i].y1 += (h - side) * 0.5;
        bboxes[i].x2 = (int)(bboxes[i].x1 + side - 1);
        bboxes[i].y2 = (int)(bboxes[i].y1 + side - 1);
        bboxes[i].x1 = (int)(bboxes[i].x1);
        bboxes[i].y1 = (int)(bboxes[i].y1);
    void MTCNN::boxRegress(const std::vector<FaceRect> &faceRects,
                               std::vector<FaceRect> &regressedRects) {
      for (uint32_t bboxId = 0; bboxId < faceRects.size(); ++bboxId) {
        FaceRect faceRect;
        float regw = faceRects[bboxId].x2 - faceRects[bboxId].x1 + 1;
        float regh = faceRects[bboxId].y2 - faceRects[bboxId].y1 + 1;
        faceRect.x1 =
            faceRects[bboxId].x1 + regw * faceRects[bboxId].regression[0] - 1;
        faceRect.y1 =
            faceRects[bboxId].y1 + regh * faceRects[bboxId].regression[1] - 1;
        faceRect.x2 =
            faceRects[bboxId].x2 + regw * faceRects[bboxId].regression[2] - 1;
        faceRect.y2 =
            faceRects[bboxId].y2 + regh * faceRects[bboxId].regression[3] - 1;
        if (faceRect.x1 >= faceRect.x2 || faceRect.y1 >= faceRect.y2)
        faceRect.score = faceRects[bboxId].score;
        faceRect.regression = faceRects[bboxId].regression;
        faceRect.pts = faceRects[bboxId].pts;
    *	生成边界框
    *	prob:置信度向量
    *	reg:边界框回归向量
    *	scale:金字塔尺度
    *	thresh:P-Net阈值
    *	im_w,im_h:输入到P-Net的图像宽高(576,324)
    *	proposals:需要返回的候选框
    void MTCNN::generateBoundingBox(Blob *prob, Blob *reg,
                                        float scale, float thresh, int im_w,
                                        int im_h,
                                        std::vector<FaceRect> &proposals) {
      int stride = 2;
      int cellSize = 12;
      //计算卷积网路输出得尺寸,W和H大小的计算,可以根据卷积神经网络W2=(W1-F+2P)/S+1, H2=(H1-F+2P)/S+1的方式递归计算出来
      int fm_width = ceil((im_w - cellSize) * 1.0 / stride) + 1; //(576-12)*1.0/2+1 = 283
      int fm_height = ceil((im_h - cellSize) * 1.0 / stride) + 1;//(324-12)*1.0/2+1 = 157
      int offset = fm_height * fm_width;//一个特征图得长度
      //prob是一个[1,2,157,283]的向量,特征图每个点有2个值,分别表示该点是否是人脸和对应分数,prob->data()) + offset得到的是特征图对应的分数
      const float *confidence_data = reinterpret_cast<float *>(prob->data()) + offset;
      const float *reg_data = reinterpret_cast<float *>(reg->data());
      for (int y = 0; y < fm_height; ++y) {
        for (int x = 0; x < fm_width; ++x) {
          int index = y * fm_width + x;
          //cout << "generateBoundingBox: confidence_data[] = " << confidence_data[index] <<  endl;
          if (confidence_data[index] >= thresh) {
            float xTop = (int)((x * stride) / scale);
            float yTop = (int)((y * stride) / scale);
            float xBot = (int)((x * stride + cellSize - 1) / scale);
            float yBot = (int)((y * stride + cellSize - 1) / scale);
            FaceRect faceRect;
            faceRect.x1 = xTop;
            faceRect.y1 = yTop;
            faceRect.x2 = xBot;
            faceRect.y2 = yBot;
            faceRect.score = confidence_data[index];
            faceRect.regression = cv::Vec4f(
                reg_data[index], reg_data[offset + index],
                reg_data[2 * offset + index], reg_data[3 * offset + index]);
    /**
     * Splits a box count into batch sizes that are powers of two, at most 128.
     *
     * Each call removes the returned amount from `*numBox`: 128 while at least
     * 128 boxes remain, otherwise the value of the highest set bit among bits
     * 6..0. Calling repeatedly until 0 is returned consumes the whole count.
     *
     * @param numBox In/out: remaining number of boxes.
     * @return Size of the next batch, or 0 when nothing is left.
     */
    size_t MTCNN::getBoxPerBatch(int *numBox) {
      if (*numBox <= 0) {
        return 0;  // also protects the bit tests below from negative values
      }
      if (*numBox >= 128) {
        *numBox = *numBox - 128;
        return 128;
      }
      // Get the left-most set bit.
      for (int index = 6; index >= 0; --index) {
        const int chunk = 1 << index;
        if (*numBox & chunk) {
          *numBox &= ~chunk;
          return chunk;
        }
      }
      return 0;
    }
    void MTCNN::wrapInputLayer(float *input_data, int c, int h, int w, std::vector<cv::Mat>* input_channels) {
      for (int i = 0; i < c; ++i) {
        cv::Mat channel(h, w, CV_32FC1, input_data);
        input_data += h * w;
    /**
     * Runs the R-Net or O-Net on a list of candidate boxes.
     *
     * Each box is cropped from `image` (using its clipped counterpart in
     * `paddings`), zero-padded back to the unclipped box size, resized to the
     * network input size and normalised. Boxes are processed in batches whose
     * sizes come from getBoxPerBatch(). Boxes whose score exceeds `threshold`
     * are emitted with the network's regression vector and, for flag != 0,
     * five landmark points.
     *
     * @param boxes     Candidate boxes.
     * @param paddings  Clipped version of `boxes`; must have the same length and
     *                  order.
     * @param image     Image the boxes refer to.
     * @param net       Network to run.
     * @param threshold Minimum score for a box to be kept.
     * @param flag      0 selects the 24x24 input with regression output
     *                  "conv5-2"; any other value selects the 48x48 input with
     *                  regression output "conv6-2" and landmark output "conv6-3".
     * @param results   Receives the accepted boxes. Any previous content is
     *                  replaced. On a network failure the function logs and
     *                  returns with the boxes accepted so far.
     */
    void MTCNN::classify_face(const std::vector<FaceRect> &boxes,
                                  const std::vector<FaceRect> &paddings,
                                  const cv::Mat &image,
                                  Net *net,
                                  double threshold, int flag,
                                  std::vector<FaceRect> &results) {
      // The caller reuses its output vector between stages.
      results.clear();

      int num_box = boxes.size();
      const int input_width = (flag == 0) ? 24 : 48;
      const int input_height = (flag == 0) ? 24 : 48;
      const cv::Size dsize(input_width, input_height);
      const cv::Scalar mean(means_[0], means_[1], means_[2]);
      const std::string outPutLayerName = ((flag == 0) ? "conv5-2" : "conv6-2");

      int reg_idx = 0;  // index of the first box of the current batch
      for (int numBoxPerBatch = getBoxPerBatch(&num_box); numBoxPerBatch > 0;
           numBoxPerBatch = getBoxPerBatch(&num_box)) {
        // ---- Fill the shared input buffer with this batch's crops ----
        float *cur_input = in_data_;
        for (int i = 0; i < numBoxPerBatch; ++i) {
          const FaceRect &box = boxes[reg_idx + i];
          const FaceRect &clipped = paddings[reg_idx + i];
          // Amount by which the box sticks out of the image on each side.
          const int pad_left = std::abs(clipped.x1 - box.x1);
          const int pad_top = std::abs(clipped.y1 - box.y1);
          const int pad_right = std::abs(clipped.x2 - box.x2);
          const int pad_bottom = std::abs(clipped.y2 - box.y2);
          cv::Mat crop_img = image(cv::Range(clipped.y1, clipped.y2 + 1),
                                   cv::Range(clipped.x1, clipped.x2 + 1));
          cv::copyMakeBorder(crop_img, crop_img, pad_top, pad_bottom, pad_left,
                             pad_right, cv::BORDER_CONSTANT, cv::Scalar(0));
          cv::Mat res;
          cv::resize(crop_img, res, dsize, 0, 0);
          res.convertTo(res, CV_32FC3);
          res = (res - mean) * 0.0078125;
          // Split straight into the network buffer: the channel headers alias
          // the slot reserved for this box.
          std::vector<cv::Mat> input_channels;
          wrapInputLayer(cur_input, 3, input_height, input_width, &input_channels);
          cur_input += 3 * input_height * input_width;
          cv::split(res, input_channels);
        }

        // ---- Run the network ----
        vector<Blob> input_blobs;
        shape_t input_shape = shape_t4(numBoxPerBatch, 3, input_height, input_width);
        input_blobs.push_back(Blob(in_data_, input_shape));
        const int ret = net->forward(input_blobs);
        if (ret) {
          cout << "net forward failed." << endl;
          return;
        }
        Blob *reg_blob = net->output(outPutLayerName);
        if (reg_blob == nullptr) {
          cout << "get output failed." << endl;
          return;
        }
        Blob *prob_blob = net->output("prob1");
        if (prob_blob == nullptr) {
          cout << "get output failed." << endl;
          return;
        }
        Blob *pts_blob = nullptr;
        if (flag) {
          pts_blob = net->output("conv6-3");
          // Check the landmark blob itself (the earlier code re-tested
          // prob_blob here, so a missing landmark output went unnoticed).
          if (pts_blob == nullptr) {
            cout << "get output failed." << endl;
            return;
          }
        }

        // ---- Collect the boxes that pass the threshold ----
        const float *confidence_data = reinterpret_cast<float *>(prob_blob->data());
        const float *reg_data = reinterpret_cast<float *>(reg_blob->data());
        const float *points_data =
            flag ? reinterpret_cast<float *>(pts_blob->data()) : nullptr;
        for (int i = 0; i < numBoxPerBatch; ++i) {
          // Two confidence values per box; the second one is used as the score.
          const float score = confidence_data[i * 2 + 1];
          if (!(score > threshold)) {
            continue;
          }
          const FaceRect &box = boxes[reg_idx + i];
          FaceRect faceRect;
          faceRect.x1 = box.x1;
          faceRect.y1 = box.y1;
          faceRect.x2 = box.x2;
          faceRect.y2 = box.y2;
          faceRect.score = score;
          faceRect.regression = cv::Vec4f(reg_data[4 * i + 0], reg_data[4 * i + 1],
                                          reg_data[4 * i + 2], reg_data[4 * i + 3]);
          if (flag) {
            // Ten values per box: five x positions followed by five y
            // positions, each relative to the box size.
            FacePts face_pts;
            const float w = faceRect.x2 - faceRect.x1 + 1;
            const float h = faceRect.y2 - faceRect.y1 + 1;
            for (int j = 0; j < 5; j++) {
              face_pts.x[j] = faceRect.x1 + points_data[j + 10 * i] * w - 1;
              face_pts.y[j] = faceRect.y1 + points_data[j + 10 * i + 5] * h - 1;
            }
            faceRect.pts = face_pts;
          }
          results.push_back(faceRect);
        }
        reg_idx += numBoxPerBatch;
      }
    }
    /**
     * Turns on timing marks during detection.
     *
     * @param ts Sink that receives the marks; it is stored, not copied, and is
     *           not deleted by this class. Pass nullptr to turn profiling off.
     */
    void MTCNN::enable_profiling(TimeStamp *ts) {
      ts_ = ts;
    }
    *	img :w*h=640*480
    *	faceRects :[]
    void MTCNN::detect(const cv::Mat &img,
                       std::vector<FaceRect> &faceRects) {
      cv::Mat image;
      cv::Mat resized;
      //将img=(w,h)=(640,480)缩放到image = (w,h) = (1920,1080),并返回最大的缩放比例,宽高不足的部分在bottom和right填充0
      //ratio = 0.44444
      float ratio = rescale_image(img, &image);
      int width = image.cols; //1920
      int height = image.rows; //1080
      int min_wh = std::min(height, width); //1080 = std::min(1080, 1920)
      int factor_count = 0;
      double m = 12. / min_size_; //m = 0.3 = 12./40
      min_wh *= m; //min_wh = 324 = 1080 * 0.3
      std::vector<double> scales;
      * min_pyramid_size_ = 12
      * factor_ = 0.5
      * scales=[0.3,0.15,0.075,0.0375,0.01875]
      * min_wh=[162,81  ,40   ,20    ,10]
      while (min_wh >= min_pyramid_size_) {
        scales.push_back(m * std::pow(factor_, factor_count));
        min_wh *= factor_;
      std::vector<FaceRect> total_boxes;
      for (int i = 0; i < factor_count; ++i) {
        double scale = scales[i];
        int ws = std::ceil(width * scale); //1920*0.3 = 576
        int hs = std::ceil(height * scale);//1080*0.3 = 324
        cv::Size dsize;
        dsize.width = ws;
        dsize.height = hs;
                   cv::Scalar(127.5, 127.5, 127.5),
        if (ts_)
          ts_->save("resize factor #" + to_string(i));
        //将image resize到第i层金字塔的尺寸
        cv::resize(image, resized, dsize, 0, 0);
        if (ts_)
          ts_->save("resize factor #" + to_string(i));
        if (ts_)
          ts_->save("preprocess factor #" + to_string(i));
        resized.convertTo(resized, CV_32FC3);
        resized = (resized - 127.5) * 0.0078125;
        std::vector<cv::Mat> input_channels;
        wrapInputLayer(in_data_, 3, hs, ws, &input_channels);
        cv::split(resized, input_channels);
        if (ts_)
          ts_->save("preprocess factor #" + to_string(i));
        vector<Blob> input_blobs;
        if (ts_)
          ts_->save("net-forward factor #" + to_string(i));
        shape_t input_shape = shape_t4(1, 3, hs, ws);
        input_blobs.push_back(Blob(in_data_, input_shape));
        int ret = pnet_->forward(input_blobs);
        if (ret != 0) {
          cout << "net forward failed: ret = " << ret << endl;
        Blob* reg_blob = pnet_->output("conv4-2");
        if (reg_blob == nullptr) {
          cout << "get output failed." << endl;
        Blob* prob_blob = pnet_->output("prob1");
        if (prob_blob == nullptr) {
          cout << "get output failed." << endl;
        if (ts_)
          ts_->save("net-forward factor #" + to_string(i));
        std::vector<FaceRect> proposals, nmsProposals;
        generateBoundingBox(prob_blob, reg_blob, scale, thresholds_[0], ws, hs,
        if (ts_)
          ts_->save("nms factor #" + to_string(i));
        nms_threshold_ = 0.5;
        nms(proposals, nmsProposals);
        if (ts_)
          ts_->save("nms factor #" + to_string(i));
        //std::cout << "pyramid w " << ws << " h " << hs << " gen "
        //          << nmsProposals.size() << std::endl;
        total_boxes.insert(total_boxes.end(), nmsProposals.begin(),
      int num_boxes = total_boxes.size();
      if (num_boxes > 0) {
        if (ts_)
          ts_->save("box reg post PNET");
        nms_threshold_ = 0.7;
        std::vector<FaceRect> temp, paddings;
        nms(total_boxes, temp);
        boxRegress(temp, total_boxes);
        padding(image, total_boxes, paddings);
        if (ts_)
          ts_->save("box reg post PNET");
        //std::cout << "PNet generate " << total_boxes.size() << std::endl;
        if (ts_)
        classify_face(total_boxes, paddings, image, rnet_, thresholds_[1], 0, temp);
        if (ts_)
        std::vector<FaceRect> temp1;
        if (ts_)
          ts_->save("box reg post RNET");
        nms(temp, temp1);
        boxRegress(temp1, total_boxes);
        padding(image, total_boxes, paddings);
        if (ts_)
          ts_->save("box reg post RNET");
        num_boxes = total_boxes.size();
        //std::cout << "RNet generate " << num_boxes << std::endl;
        if (num_boxes > 0) {
          if (ts_)
          classify_face(total_boxes, paddings, image, onet_, thresholds_[2], 1,
          if (ts_)
          //cout << "after classify_face: " << temp.size() << endl;
          if (ts_)
            ts_->save("box reg post ONET");
          boxRegress(temp, temp1);
          nms_threshold_ = 0.5;
          nms(temp1, faceRects);
          for (size_t i = 0; i < faceRects.size(); ++i) {
            int h = image.rows;
            int w = image.cols;
            faceRects[i].x1 = (faceRects[i].x1 < 0 ? 0 : faceRects[i].x1) * ratio;
            faceRects[i].y1 = (faceRects[i].y1 < 0 ? 0 : faceRects[i].y1) * ratio;
            faceRects[i].x2 = (faceRects[i].x2 > w - 1 ? w - 1 : faceRects[i].x2) * ratio;
            faceRects[i].y2 = (faceRects[i].y2 > h - 1 ? h - 1 : faceRects[i].y2) * ratio;
            for (int j = 0; j < 5; j++) {
              faceRects[i].pts.x[j] *= ratio;
              faceRects[i].pts.y[j] *= ratio;
          if (ts_)
            ts_->save("box reg post ONET");
      //cout << "final predict " << faceRects.size() << " bboxes" << endl << endl;


