zoukankan      html  css  js  c++  java
  • Yolov4性能分析(下)

    Yolov4性能分析(下)

     

    六. 权重更新

    "darknet/src/detector.c"--train_detector()函数中:

     

            ......

            /* Run one training iteration and fold its loss into the running average. */

            float loss = 0;

    #ifdef GPU

            if (ngpus == 1) {

                int wait_key = (dont_show) ? 0 : 1;

                loss = train_network_waitkey(net, train, wait_key); // Single GPU: train_network_waitkey() (network.c) allocates the batch buffers and runs the training pass.

            }

            else {

                loss = train_networks(nets, ngpus, train, 4); // Multi-GPU training: train_networks() (network_kernels.cu, per the article).

            }

    #else

            loss = train_network(net, train); // CPU build: per the article this is train_network_waitkey(net, d, 0).

    #endif

            if (avg_loss < 0 || avg_loss != avg_loss) avg_loss = loss;    // Reseed the average when it is negative or NaN (x != x is true only for NaN).

            avg_loss = avg_loss*.9 + loss*.1;

            ......

     

    以CPU训练为例,"darknet/src/network.c"--train_network()函数,执行train_network_waitkey(net, d, 0):

     

    /*
     * Train on every mini-batch contained in `d`, then apply one weight update.
     *
     *   net      - network to train; net.batch must divide d.X.rows.
     *   d        - training data (d.X inputs, d.y targets).
     *   wait_key - when non-zero, wait_key_cv(5) is called after each mini-batch.
     *
     * Returns the summed per-mini-batch error divided by the number of
     * samples processed (mini-batch count * net.batch).
     */
    float train_network_waitkey(network net, data d, int wait_key)
    {
        assert(d.X.rows % net.batch == 0);

        /* Per the article, parse_net_options() (parser.c) has already divided
         * net->batch by the subdivision count, so this is the mini-batch size. */
        const int batch = net.batch;
        /* Number of mini-batches in d (the article equates this with the
         * subdivision count for single-GPU and CPU runs). */
        const int num_batches = d.X.rows / batch;

        float *X = (float *)xcalloc(batch * d.X.cols, sizeof(float));
        float *y = (float *)xcalloc(batch * d.y.cols, sizeof(float));

        float sum = 0;
        int step = 0;
        while (step < num_batches) {
            get_next_batch(d, batch, step * batch, X, y);
            net.current_subdivision = step;
            /* train_network_datum() returns the error for this mini-batch. */
            sum += train_network_datum(net, X, y);
            if (wait_key) {
                wait_key_cv(5);
            }
            ++step;
        }

        (*net.cur_iteration) += 1;

    #ifdef GPU
        update_network_gpu(net);
    #else   // GPU
        update_network(net);
    #endif  // GPU

        free(X);
        free(y);

        return (float)sum / (num_batches * batch);
    }

     

    其中,调用train_network_datum()函数计算error是核心:

     

    /*
     * Run one forward pass and one backward pass on a single mini-batch and
     * return the resulting network cost.
     *
     *   net - network to train; *net.seen is advanced by net.batch.
     *   x   - input buffer for the mini-batch.
     *   y   - ground-truth buffer for the mini-batch.
     *
     * In a GPU build with gpu_index >= 0 the work is delegated to
     * train_network_datum_gpu() instead.
     */
    float train_network_datum(network net, float *x, float *y)
    {
    #ifdef GPU
        if (gpu_index >= 0) {
            return train_network_datum_gpu(net, x, y);
        }
    #endif
        network_state state = {0};

        *net.seen += net.batch;

        /* Build the state handed to every layer. */
        state.net = net;
        state.index = 0;
        state.train = 1;
        state.input = x;
        state.truth = y;
        state.delta = 0;    /* no delta buffer for the network input */

        forward_network(net, state);    /* CPU forward pass */
        backward_network(net, state);   /* CPU back-propagation */

        return get_network_cost(net);
    }

     

     

     

     进一步分析forward_network()函数:

     

    /*
     * CPU forward pass: invoke each layer's forward() callback in order,
     * feeding every layer the output buffer of the layer before it.
     *
     *   net   - network whose layers are executed (net.n layers).
     *   state - per-pass state; state.input is the network input on entry.
     */
    void forward_network(network net, network_state state)
    {
        int idx = 0;

        state.workspace = net.workspace;

        while (idx < net.n) {
            layer cur = net.layers[idx];
            state.index = idx;

            /* When training, zero the layer's delta buffer before it runs
             * (scal_cpu with a factor of 0). */
            if (state.train && cur.delta) {
                scal_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);
            }

            /* Dispatch through the layer's function pointer, e.g.
             * forward_convolutional_layer or forward_yolo_layer. */
            cur.forward(cur, state);

            /* This layer's output becomes the next layer's input. */
            state.input = cur.output;
            ++idx;
        }
    }

     

    卷积层时,forward_convolutional_layer()函数:

     

    /*
     * CPU forward pass of a convolutional layer.
     *
     * The output is zeroed and then, for every (batch item, group) pair, the
     * input is unrolled into state.workspace and multiplied with the group's
     * weights by gemm(). Afterwards batch-norm or the bias is applied, followed
     * by the activation. When l.xnor is set and pre-aligned bit weights are
     * available at inference time, a binary GEMM path is used instead.
     *
     *   l     - the layer (passed by value; buffers are reached via pointers).
     *   state - pass state; state.input is this layer's input and
     *           state.workspace is scratch memory.
     */
    void forward_convolutional_layer(convolutional_layer l, network_state state)
    {
        /* Spatial size of the layer output. */
        int out_h = convolutional_out_height(l);
        int out_w = convolutional_out_width(l);
        int i, j;

        fill_cpu(l.outputs*l.batch, 0, l.output, 1); /* zero the output buffer */

        /* XNOR-net: binarize the weights and the input. The original code
         * re-tested the same condition in a nested if; that was redundant. */
        if (l.xnor && (!l.align_bit_weights || state.train)) {
            binarize_weights(l.weights, l.n, l.nweights, l.binary_weights);
            swap_binary(&l);
            binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
            state.input = l.binary_input;
        }

        /* GEMM dimensions per group:
         *   m = filters per group, k = weights per filter, n = output pixels. */
        int m = l.n / l.groups;
        int k = l.size*l.size*l.c / l.groups;
        int n = out_h*out_w;

        for (i = 0; i < l.batch; ++i)
        {
            for (j = 0; j < l.groups; ++j)
            {
                /* a -> this group's weights, b -> workspace, c -> output slice. */
                float *a = l.weights + j*l.nweights / l.groups;
                float *b = state.workspace;
                float *c = l.output + (i*l.groups + j)*n*m;

                if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y)
                {
                    memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float));

                    if (l.c % 32 == 0)
                    {
                        int ldb_align = l.lda_align;
                        size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;

                        int re_packed_input_size = l.c * l.w * l.h;
                        memset(state.workspace, 0, re_packed_input_size * sizeof(float));

                        const size_t new_c = l.c / 32;
                        size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
                        memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));

                        // float32x4 by channel (as in cuDNN)
                        repack_input(state.input, state.workspace, l.w, l.h, l.c);

                        // 32 x floats -> 1 x uint32_t
                        float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h);

                        /* im2col: unroll each kernel-sized window of the image into a
                         * column so the convolution becomes a matrix product. */
                        im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);

                        int new_k = l.size*l.size*l.c / 32;
                        transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb);

                        /* Binary GEMM: performs the convolution as a matrix product. */
                        gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
                    }
                    else
                    {
                        im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

                        // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits)
                        {
                            int ldb_align = l.lda_align;
                            size_t new_ldb = k + (ldb_align - k%ldb_align);

                            /* The returned size was stored in an unused local; the
                             * call is kept because it fills l.t_bit_input. */
                            binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align);

                            // 5x times faster than gemm()-float32
                            gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);
                        }
                    }

                    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); /* add the bias term */

                    /* Non-linearity (Swish, Mish, leaky ReLU, ...). */
                    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                    else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
                    else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
                    else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
                    else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);

                    /* NOTE(review): this returns from inside the batch/group loops,
                     * i.e. after the first (i, j) iteration - behaviour kept as-is. */
                    return;
                }
                else {
                    float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;

                    if (l.size == 1) {
                        /* 1x1 kernel: the image is used directly, without im2col. */
                        b = im;
                    }
                    else {
                        im2col_cpu_ext(im,   // input
                            l.c / l.groups,     // input channels
                            l.h, l.w,           // input size (h, w)
                            l.size, l.size,     // kernel size (h, w)
                            l.pad * l.dilation, l.pad * l.dilation,       // padding (h, w)
                            l.stride_y, l.stride_x, // stride (h, w)
                            l.dilation, l.dilation, // dilation (h, w)
                            b);                 // output
                    }

                    gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                    // bit-count to float
                }
            }
        }

        if (l.batch_normalize) {
            /* Batch normalization (handles the bias itself, per the else branch). */
            forward_batchnorm_layer(l, state);
        }
        else {
            /* No batch-norm: add the bias directly, output += bias. */
            add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
        }

        /* Non-linearity (Swish, Mish, leaky ReLU, ...). */
        if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
        else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
        else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
        else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
        else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
        else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);

        if (l.binary || l.xnor) swap_binary(&l); /* swap the binary/float weight buffers again */

        if (l.assisted_excitation && state.train) assisted_excitation_forward(l, state);

        if (l.antialiasing) {
            /* Run the attached input_layer on this layer's output and copy its
             * result back into l.output. */
            network_state s = { 0 };
            s.train = state.train;
            s.workspace = state.workspace;
            s.net = state.net;
            s.input = l.output;
            forward_convolutional_layer(*(l.input_layer), s);
            memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
        }
    }

     

    yolo层时,forward_yolo_layer()函数:

     

    /*
     * CPU forward pass of a YOLO detection layer.
     *
     * Copies the input to the output, applies the logistic activation to the
     * x/y, objectness and class entries (CPU builds only), and - when training -
     * fills l.delta with the gradients for every predicted box and accumulates
     * the IoU/GIoU/DIoU/CIoU statistics that feed *(l.cost).
     *
     *   l     - the YOLO layer (buffers reached through pointers).
     *   state - pass state; state.input is the previous layer's output and
     *           state.truth holds the ground-truth boxes, (4 + 1) floats per
     *           box and l.truths floats per image.
     *
     * This is an excerpt: two regions elided by the article are kept as
     * comments, and some variables used near the end (loss, iou_loss,
     * classification_loss, avg_iou_loss) are declared in the elided code.
     */
    void forward_yolo_layer(const layer l, network_state state)
    {
        int i, j, b, t, n;

        memcpy(l.output, state.input, l.outputs*l.batch * sizeof(float)); /* layer input -> layer output */

        /* CPU build: logistic-activate x, y, objectness and all class scores. */
    #ifndef GPU
        for (b = 0; b < l.batch; ++b) {
            for (n = 0; n < l.n; ++n) {
                int index = entry_index(l, b, n*l.w*l.h, 0); /* start of box n (entry 0) in image b */

                /* Logistic activation of the predicted tx, ty. */
                activate_array(l.output + index, 2 * l.w*l.h, LOGISTIC);        // x,y,
                scal_add_cpu(2 * l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1);    // scale x,y

                index = entry_index(l, b, n*l.w*l.h, 4); /* start of the objectness entry */

                /* Logistic activation of objectness plus every class score. */
                activate_array(l.output + index, (1 + l.classes)*l.w*l.h, LOGISTIC);
            }
        }
    #endif

        // delta is zeroed
        memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); /* whole batch */

        if (!state.train) return; /* inference: nothing more to do */

        float tot_iou = 0;
        float tot_giou = 0;
        float tot_diou = 0;
        float tot_ciou = 0;
        float tot_iou_loss = 0;
        float tot_giou_loss = 0;
        float tot_diou_loss = 0;
        float tot_ciou_loss = 0;
        float recall = 0;
        float recall75 = 0;
        float avg_cat = 0;
        float avg_obj = 0;
        float avg_anyobj = 0;
        int count = 0;
        int class_count = 0;

        *(l.cost) = 0; /* reset this layer's total loss */

        for (b = 0; b < l.batch; ++b) { /* every image in the batch */
            for (j = 0; j < l.h; ++j) {
                for (i = 0; i < l.w; ++i) { /* every grid cell [j, i] */
                    for (n = 0; n < l.n; ++n) { /* every predicted box n of the cell */
                        /* Offsets of the first class score, the objectness score and
                         * the box coordinates of this prediction. */
                        const int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        /* Fix: in the article this declaration had been swallowed by
                         * the trailing comment of the previous line. */
                        const int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                        const int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                        const int stride = l.w*l.h;

                        /* Decode the predicted box for this cell/anchor. */
                        box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h);

                        float best_match_iou = 0;
                        int best_match_t = 0;
                        float best_iou = 0; /* largest IoU against any ground-truth box */
                        int best_t = 0;     /* index of that ground-truth box */

                        for (t = 0; t < l.max_boxes; ++t) { /* every ground-truth box */
                            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
                            int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];

                            /* Skip labels whose class id is outside [0, l.classes). */
                            if (class_id >= l.classes || class_id < 0) {
                                printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] ", class_id, l.classes, l.classes - 1);
                                printf(" truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d ", truth.x, truth.y, truth.w, truth.h, class_id);
                                if (check_mistakes) getchar();
                                continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
                            }

                            if (!truth.x) break;  /* x == 0 marks the end of the used truth slots */

                            float objectness = l.output[obj_index];
                            if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;

                            /* Class agreement test with a 0.25 threshold (see compare_yolo_class). */
                            int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);

                            float iou = box_iou(pred, truth);

                            /* Best IoU among truths whose class also matches. */
                            if (iou > best_match_iou && class_id_match == 1) {
                                best_match_iou = iou;
                                best_match_t = t;
                            }

                            if (iou > best_iou) {
                                best_iou = iou;
                                best_t = t;
                            }
                        }

                        avg_anyobj += l.output[obj_index];

                        /* Default: treat the prediction as "no object". */
                        l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]);

                        if (best_match_iou > l.ignore_thresh) {
                            /* Overlaps a matching truth enough: do not punish objectness. */
                            const float iou_multiplier = best_match_iou*best_match_iou;// (best_match_iou - l.ignore_thresh) / (1.0 - l.ignore_thresh);

                            if (l.objectness_smooth) {
                                l.delta[obj_index] = l.cls_normalizer * (iou_multiplier - l.output[obj_index]);

                                int class_id = state.truth[best_match_t*(4 + 1) + b*l.truths + 4];
                                if (l.map) class_id = l.map[class_id];

                                const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                                l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
                            }
                            else l.delta[obj_index] = 0;
                        }
                        else if (state.net.adversarial) { /* adversarial training */
                            int stride = l.w*l.h;
                            float scale = pred.w * pred.h;
                            if (scale > 0) scale = sqrt(scale);

                            l.delta[obj_index] = scale * l.cls_normalizer * (0 - l.output[obj_index]);

                            int cl_id;
                            for (cl_id = 0; cl_id < l.classes; ++cl_id) {
                                if(l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25)
                                    l.delta[class_index + stride*cl_id] = scale * (0 - l.output[class_index + stride*cl_id]);
                            }
                        }

                        /* The article notes the cfg sets truth_thresh = 1, in which case
                         * this branch never runs. */
                        if (best_iou > l.truth_thresh) {
                            const float iou_multiplier = best_iou*best_iou;// (best_iou - l.truth_thresh) / (1.0 - l.truth_thresh);

                            if (l.objectness_smooth) l.delta[obj_index] = l.cls_normalizer * (iou_multiplier - l.output[obj_index]);
                            else l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]);

                            int class_id = state.truth[best_t*(4 + 1) + b*l.truths + 4];
                            if (l.map) class_id = l.map[class_id];

                            delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);

                            const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                            if (l.objectness_smooth) l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);

                            box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1);
                            delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
                        }
                    }
                }
            }

            for (t = 0; t < l.max_boxes; ++t) { /* every ground-truth box */
                box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);

                if (truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 || truth.w < 0 || truth.h < 0) {
                    char buff[256];
                    printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f ", truth.x, truth.y, truth.w, truth.h);
                    /* Fix: the inner quotes are escaped again (the article had lost the
                     * backslashes), and snprintf bounds the write to buff. */
                    snprintf(buff, sizeof(buff), "echo \"Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f\" >> bad_label.list",
                        truth.x, truth.y, truth.w, truth.h);
                    system(buff);
                }

                int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                if (class_id >= l.classes || class_id < 0) continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value

                if (!truth.x) break;  /* x == 0: fewer than max_boxes truths are present */

                float best_iou = 0; /* largest anchor/truth IoU */
                int best_n = 0;     /* index of that anchor */

                /* Grid cell containing the truth box centre. */
                i = (truth.x * l.w);
                j = (truth.y * l.h);

                /* Compare shapes only: move the truth box to the origin. */
                box truth_shift = truth;
                truth_shift.x = truth_shift.y = 0;

                for (n = 0; n < l.total; ++n) { /* find the anchor with the largest IoU */
                    box pred = { 0 };
                    pred.w = l.biases[2 * n] / state.net.w;     /* anchor width relative to the network input */
                    pred.h = l.biases[2 * n + 1] / state.net.h; /* anchor height relative to the network input */

                    float iou = box_iou(pred, truth_shift);
                    if (iou > best_iou) {
                        best_iou = iou;
                        best_n = n;
                    }
                }

                /* Position of best_n within this layer's mask; negative when the
                 * anchor is not handled by this layer (given the >= 0 test below). */
                int mask_n = int_index(l.mask, best_n, l.n);

                if (mask_n >= 0) {
                    int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                    if (l.map) class_id = l.map[class_id];

                    int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);

                    /* Optional per-class weight, 1 when no multipliers are configured. */
                    const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;

                    /* Box-coordinate gradients for the best anchor. */
                    ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);

                    /* Accumulate IoU / GIoU / DIoU / CIoU statistics. */
                    // range is 0 <= 1
                    tot_iou += all_ious.iou;
                    tot_iou_loss += 1 - all_ious.iou;
                    // range is -1 <= giou <= 1
                    tot_giou += all_ious.giou;
                    tot_giou_loss += 1 - all_ious.giou;

                    tot_diou += all_ious.diou;
                    tot_diou_loss += 1 - all_ious.diou;

                    tot_ciou += all_ious.ciou;
                    tot_ciou_loss += 1 - all_ious.ciou;

                    int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                    avg_obj += l.output[obj_index];
                    l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 - l.output[obj_index]); /* objectness gradient */

                    int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                    delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers); /* class gradients */

                    ++count;
                    ++class_count;
                    if (all_ious.iou > .5) recall += 1;
                    if (all_ious.iou > .75) recall75 += 1;
                }

                // iou_thresh
                /* Additionally train every other anchor of this layer whose
                 * IoU with the truth exceeds l.iou_thresh. */
                for (n = 0; n < l.total; ++n) {
                    int mask_n = int_index(l.mask, n, l.n);
                    if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) {
                        box pred = { 0 };
                        pred.w = l.biases[2 * n] / state.net.w;
                        pred.h = l.biases[2 * n + 1] / state.net.h;
                        float iou = box_iou_kind(pred, truth_shift, l.iou_thresh_kind); // IOU, GIOU, MSE, DIOU, CIOU
                        // iou, n

                        if (iou > l.iou_thresh) {
                            int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                            if (l.map) class_id = l.map[class_id];

                            int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                            const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                            ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);

                            // range is 0 <= 1
                            tot_iou += all_ious.iou;
                            tot_iou_loss += 1 - all_ious.iou;
                            // range is -1 <= giou <= 1
                            tot_giou += all_ious.giou;
                            tot_giou_loss += 1 - all_ious.giou;

                            tot_diou += all_ious.diou;
                            tot_diou_loss += 1 - all_ious.diou;

                            tot_ciou += all_ious.ciou;
                            tot_ciou_loss += 1 - all_ious.ciou;

                            int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                            avg_obj += l.output[obj_index];
                            l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 - l.output[obj_index]);

                            int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                            delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);

                            ++count;
                            ++class_count;
                            if (all_ious.iou > .5) recall += 1;
                            if (all_ious.iou > .75) recall75 += 1;
                        }
                    }
                }
            }

            // averages the deltas obtained by the function: delta_yolo_box()_accumulate
            for (j = 0; j < l.h; ++j) {
                for (i = 0; i < l.w; ++i) {
                    for (n = 0; n < l.n; ++n) {
                        int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        const int stride = l.w*l.h; /* number of cells in the feature map */
                        averages_yolo_deltas(class_index, box_index, stride, l.classes, l.delta);
                    }
                }
            }
        }

        /* ...... (code elided in the article) ...... */

        // gIOU loss + MSE (objectness) loss
        if (l.iou_loss == MSE) {
            *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
        }
        else {
            // Always compute classification loss both for iou + cls loss and for logging with mse loss
            // TODO: remove IOU loss fields before computing MSE on class
            // probably split into two arrays
            if (l.iou_loss == GIOU) {
                /* Mean GIoU loss per matched box (tot_giou_loss sums 1 - giou). */
                avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_giou_loss / count) : 0;
            }
            else {
                /* Mean IoU loss per matched box (tot_iou_loss sums 1 - iou). */
                avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_iou_loss / count) : 0;
            }
            *(l.cost) = avg_iou_loss + classification_loss; /* layer cost = IoU loss + classification loss */
        }

        loss /= l.batch; /* per-image averages */
        classification_loss /= l.batch;
        iou_loss /= l.batch;

        /* ...... (code elided in the article) ...... */
    }

     

    再来分析backward_network()函数:

     

     void backward_network(network net, network_state state)
     {
        int i;
        float *original_input = state.input;
        float *original_delta = state.delta;
        state.workspace = net.workspace;
        for(i = net.n-1; i >= 0; --i){
            state.index = i;
            if(i == 0){
                state.input = original_input;
                state.delta = original_delta;
            }else{
                layer prev = net.layers[i-1];
                state.input = prev.output;
                state.delta = prev.delta; // delta是指针变量,对state.delta做修改,就相当与对prev层的delta做了修改。
            }
            layer l = net.layers[i];
            if (l.stopbackward) break;
            if (l.onlyforward) continue;
            l.backward(l, state); // 不同层l.backward代表不同函数,如:convolutional_layer.c中,l.backward = backward_convolutional_layer;yolo_layer.c中,l.backward = backward_yolo_layer,CPU执行反向传播。
        }
     }

     

    卷积层时,backward_convolutional_layer()函数:

     

    /*
     * CPU backward pass of a convolutional layer.
     *
     * Applies the activation gradient to l.delta, handles batch-norm or the
     * bias gradient, then for every (batch item, group) pair accumulates the
     * weight gradient into l.weight_updates and, when state.delta is set,
     * writes the gradient for the layer input through state.delta.
     *
     *   l     - the layer (passed by value; buffers are reached via pointers).
     *   state - pass state; backward_network() sets state.input / state.delta
     *           to the previous layer's output / delta before calling this.
     */
    void backward_convolutional_layer(convolutional_layer l, network_state state)
    {
        int i, j;

        /* GEMM dimensions per group. Note the roles of n and k here:
         *   m = filters per group, n = weights per filter, k = output pixels. */
        const int m = l.n / l.groups;
        const int n = l.size*l.size*l.c / l.groups;
        const int k = l.out_w*l.out_h;

        /* Fold the activation gradient into l.delta. */
        if (l.activation == SWISH) {
            gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
        } else if (l.activation == MISH) {
            gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
        } else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) {
            gradient_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
        } else if (l.activation == NORM_CHAN) {
            gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
        } else {
            gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
        }

        if (l.batch_normalize) {
            backward_batchnorm_layer(l, state);
        } else {
            /* No batch-norm: accumulate the bias gradient directly. */
            backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
        }

        for (i = 0; i < l.batch; ++i) {
            for (j = 0; j < l.groups; ++j) {
                const int slot = i*l.groups + j;

                float *out_delta = l.delta + slot*m*k;
                float *columns = state.workspace;
                float *weight_grad = l.weight_updates + j*l.nweights / l.groups;

                /* Input image of this batch item / group (the previous layer's output). */
                float *im = state.input + slot*(l.c / l.groups)*l.h*l.w;

                im2col_cpu_ext(
                    im,                 // input
                    l.c / l.groups,     // input channels
                    l.h, l.w,           // input size (h, w)
                    l.size, l.size,     // kernel size (h, w)
                    l.pad * l.dilation, l.pad * l.dilation,       // padding (h, w)
                    l.stride_y, l.stride_x, // stride (h, w)
                    l.dilation, l.dilation, // dilation (h, w)
                    columns);           // output

                /* Weight gradient: weight_grad += out_delta * columns^T. */
                gemm(0, 1, m, n, k, 1, out_delta, k, columns, k, 1, weight_grad, n);

                /* Gradient w.r.t. the layer input, written to the previous
                 * layer's delta when one exists. */
                if (state.delta) {
                    float *weights = l.weights + j*l.nweights / l.groups;
                    float *col_delta = state.workspace;

                    gemm(1, 0, n, k, m, 1, weights, n, out_delta, k, 0, col_delta, k);

                    col2im_cpu_ext(
                        state.workspace,        // input
                        l.c / l.groups,         // input channels (h, w)
                        l.h, l.w,               // input size (h, w)
                        l.size, l.size,         // kernel size (h, w)
                        l.pad * l.dilation, l.pad * l.dilation,           // padding (h, w)
                        l.stride_y, l.stride_x,     // stride (h, w)
                        l.dilation, l.dilation, // dilation (h, w)
                        state.delta + slot*(l.c / l.groups)*l.h*l.w); // output (delta)
                }
            }
        }
    }

     

    yolo层时,backward_yolo_layer()函数:

    /*
     * Backward pass of a yolo layer.
     * Adds this layer's delta (l.batch*l.inputs floats) onto state.delta,
     * the gradient buffer handed in by the caller.
     */
    void backward_yolo_layer(const layer l, network_state state)

    {

       axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); // Accumulates (not overwrites): state.delta[i] += 1 * l.delta[i]. Per the article, the delta buffer passed in points at the previous layer's delta.

    }

    正向、反向传播后,通过get_network_cost()函数计算Loss:

     

    /*
     * Returns the mean of cost[0] over every layer in net whose cost pointer is set.
     * NOTE(review): if no layer has a cost pointer, count stays 0 and the final
     * division is by zero — confirm every network passed here has a cost-bearing layer.
     */
    float get_network_cost(network net)

    {

        int i;

        float sum = 0;

        int count = 0;

        for(i = 0; i < net.n; ++i){

            if(net.layers[i].cost){ // Collect the loss of each layer that has one; per the article only detection (yolo) layers carry a cost.

                sum += net.layers[i].cost[0]; // The layer's total loss is stored in cost[0] (the article points to forward_cost_layer() in cost_layer.c).

                ++count;

            }

        }

        return sum/count; // Average loss over the cost-bearing layers.

    }

    CIOU_Loss是创新点,与GIOU_Loss相比,引入了重叠面积与中心点的距离Dis_2来区分预测框a与b的定位差异,同时还引入了预测框和目标框的长宽比一致性因子ν,将a与c这种重叠面积与中心点距离相同但长宽比与目标框适配程度有差异的预测框区分开来,如图:

     

    计算好Loss后,需要调用update_network()更新权重:

     

    /*
     * Runs the update callback of every layer that defines one.
     * Each callback receives update_batch = net.batch*net.subdivisions, the current
     * learning rate from get_current_rate(net), and the network's momentum and decay.
     */
    void update_network(network net)
    {
        int i;
        int update_batch = net.batch*net.subdivisions;
        float rate = get_current_rate(net);
        for(i = 0; i < net.n; ++i){
            layer l = net.layers[i];
            if(l.update){
                l.update(l, update_batch, rate, net.momentum, net.decay); // For convolutional layers, l.update = update_convolutional_layer (assigned in convolutional_layer.c).
            }
        }
    }

     

    update_convolutional_layer()函数:

     

    /*
     * Applies one momentum + weight-decay update step to a convolutional layer (CPU path).
     *   batch              - divisor applied to the learning rate and multiplier applied to decay
     *   learning_rate_init - base learning rate; scaled by l.learning_rate_scale before use
     *   momentum           - factor the *_updates buffers are multiplied by after being applied
     *   decay              - weight-decay coefficient
     */
    void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate_init, float momentum, float decay)

    {

        float learning_rate = learning_rate_init*l.learning_rate_scale;

        axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); // axpy_cpu (blas.c): for i in [0, l.nweights), l.weight_updates[i*1] -= decay*batch*l.weights[i*1].

        axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); // for i in [0, l.nweights), l.weights[i*1] += (learning_rate/batch)*l.weight_updates[i*1].

        scal_cpu(l.nweights, momentum, l.weight_updates, 1); // scal_cpu (blas.c): for i in [0, l.nweights), l.weight_updates[i*1] *= momentum.

        axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1); // for i in [0, l.n), l.biases[i*1] += (learning_rate/batch)*l.bias_updates[i*1].

        scal_cpu(l.n, momentum, l.bias_updates, 1); // for i in [0, l.n), l.bias_updates[i*1] *= momentum.

        if (l.scales) { // Same apply-then-scale step for l.scales when present (presumably the batch-norm scale parameters — confirm).

            axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);

            scal_cpu(l.n, momentum, l.scale_updates, 1);

        }

    }

     

    同样,在network_kernels.cu里,有GPU模式下的forward&backward相关的函数,涉及数据格式转换及加速,此处只讨论原理,暂时忽略GPU部分的代码。

     

    /* Excerpt only: the "......" lines mark code the article omitted. */
    void forward_backward_network_gpu(network net, float *x, float *y)
    {
    ......
        forward_network_gpu(net, state); // Forward pass.
        backward_network_gpu(net, state); // Backward pass.
    ......
    }

     

    CPU模式下,采用带momentum的常规GD更新weights,同时在network.c中也提供了train_network_sgd()函数接口;GPU模式提供了adam选项,convolutional_layer.c中make_convolutional_layer()函数有体现。

    七. 调参总结

    本人在实际项目中涉及的是工业中的钢铁表面缺陷检测场景,不到2000张图片,3类,数据量很少。理论上YOLO系列并不太适合缺陷检测的问题,基于分割+分类的网络、Cascade-RCNN等或许是更好的选择,但我本着实验的态度,进行了多轮的训练和对比,整体上效果还是不错的。

    1.max_batches: AlexeyAB在github工程上有提到,类别数*2000作为参考,不要少于6000,但这个是使用预训练权重的情况。如果train from scratch,要适当增加,具体要看你的数据情况,网络需要额外的时间来从零开始学习;

    2.pretrain or not: 当数据量很少时,预训练确实能更快使模型收敛,效果也不错,但缺陷检测这类问题,缺陷目标特征本身的特异性还是比较强的,虽然我的数据量也很少,但scratch的方式还是能取得稍好一些的效果;

    3.anchors: cfg文件默认的anchors是基于COCO数据集,可以说尺度比较均衡,使用它效果不会差,但如果你自己的数据在尺度分布上不太均衡,建议自行生成新的anchors,可以直接使用源码里面的脚本,注意,要根据生成anchors的size(1-yolo:<30*30,2-yolo:<60*60,3-yolo:others)来改变索引值masks以及前一个conv层的filters参数;

    4.rotate: YOLO-V4在目标检测这一块,其实没有用到旋转来进行数据增强,因此我在线下对数量最少的一个类进行了180°旋转对称增强,该类样本数扩增一倍,效果目前还不明显,可能是数据量增加的还是太少,而且我还在训练对比,完成后可以补充;

    5.mosaic: 马赛克数据增强是必须要有的,mAP值提升比较明显,需要安装opencv,且和cutmix不能同时使用。

     

    1. Draw object:

    #if defined(OPENCV) && defined(GPU)

    read_data_cfg

    option_find_str

    get_labels_custom

    load_alphabet

    parse_network_cfg

    parse_network_cfg_custom

    set_batch_network

    load_weights

    load_image

    resize_image

    copy_image

    cv_draw_object

    basecfg

    draw_train_chart

    forward_backward_network_gpu

    draw_train_loss

    crop_image

    copy_image_inplace

    embed_image

    show_image_cv

    quantize_image

    network_predict

    save_image_png

    get_network_boxes

    do_nms_sort

    diounms_sort

    draw_detections_v3

    save_image

    1. calc_anchors

    read_data_cfg

    option_find_str

    get_paths

    list_to_array

    option_find_int

    replace_image_to_label

    read_boxes

    counter_per_class

    calculating k-means++

    make_matrix

    do_kmeans

    show_anhors

    1. validate_detector_recall

    parse_network_cfg_custom

    load_weights

    fuse_conv_batchnorm

    load_image

    resize_image

    basecfg

    network_predict

    get_network_boxes

    do_nms_obj

    replace_image_to_label

    read_boxes

    1. validate_detector_map

    read_data_cfg

    option_find_str

    get_labels_custom

    read_map

    remember_network_recurrent_state

    free_network_recurrent_state

    parse_network_cfg_custom

    load_weights

    fuse_conv_batchnorm

    calculate_binary_weights

    get_paths

    list_to_array

    // For multi-class precision and recall computation

    load_data_in_thread

    pthread_join

    load_data_in_thread

    basecfg

    network_predict

    get_network_boxes

    do_nms_sort

    diounms_sort

    set_track_id

    replace_image_to_label

    read_boxes

    SORT(detections)

    // for PR-curve

    // correct mAP calculation: ImageNet, PascalVOC 2010-2012

    //add remaining area of PR curve when recall isn't 0 at rank-1

    // free memory

    restore_network_recurrent_state

    return mean_average_precision;

    1. train_detector

    read_data_cfg

    option_find_str

    cuda_set_device

    parse_network_cfg_custom

    get_labels_custom

    basecfg

    cuda_set_device

    parse_network_cfg

    get_current_iteration

    draw_train_chart

    load_data

    rand_scale

    // at the beginning (check if enough memory) and at the end (calc rolling mean/variance)

    pthread_join

    load_data

    resize_network

    pthread_join

    load_data

    float_to_box

    float_to_image

    train_network_waitkey

    train_networks

    get_current_iteration

    // calculate mAP for each 4 Epochs

    resize_network

    copy_weights_net

    // combine Training and Validation networks

    draw_train_loss

    sync_nets

    save_weights

    // free memory

    //free_network(net);

    1. test_detector

    read_data_cfg

    option_find_str

    get_labels_custom

    parse_network_cfg_custom

    load_weights

    fuse_conv_batchnorm(net);

    calculate_binary_weights(net);

    letterbox_image

    resize_image

    get_network_boxes

    do_nms_sort

    diounms_sort

    draw_detections_v3

    save_image

    show_image

    detection_to_json

    replace_image_to_label

    // free memory

    1. demo

    parse_network_cfg_custom

    load_weights

    fuse_conv_batchnorm(net);

    calculate_binary_weights(net);

    get_capture_video_stream

    get_capture_webcam

    custom_create_thread

    fetch_in_thread_sync

    detect_in_thread_sync

    create_window_cv

    create_video_writer

    get_time_point

    custom_atomic_store_int

    do_nms_obj

    diounms_sort

    set_track_id

    send_json

    send_http_post_request

    draw_detections_cv_v3

    max_val_cmp

    send_mjpeg

    write_frame_cv

    this_thread_yield

    release_video_writer

    //free memory and thread

    Coco依赖的软件:coco.data,yolov4.cfg,yolov4.weights

    1. duration_make_convolutional_layer: 336607

    convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train)

    1) Preprocessing:Blur,antialiasing

    2) cuda_make_array

    3) get_convolutional_workspace_size

    4) make_convolutional_layer

    5) push_convolutional_layer,read_weights,

    1. duration_run_detector_demo: 339565723

    1) read_data_cfg(datacfg);

    2) option_find_int(options, "classes", 20);

    3) option_find_str(options, "names", "data/names.list");

    4) get_labels(name_list);

    5) demo(cfg, weights, thresh, hier_thresh, cam_index, filename, names, classes, avgframes, frame_skip, prefix, out_filename,

                mjpeg_port, dontdraw_bbox, json_port, dont_show, ext_output, letter_box, time_limit_sec, http_post_host, benchmark, benchmark_layers);

    1. duration_main_run_detector: 339565785

    run_detector(argc, argv);

    1) find_arg, find_int_arg, find_char_arg, find_float_arg

    2) test_detector,

    3) train_detector,

    4) validate_detector, validate_detector_recall, validate_detector_map

    5) calc_anchors

    6) draw_object

    7) demo(read_data_cfg, option_find_int, option_find_str, get_labels, free_list_contents_kvp, free_list, free(gpus))

    1. duration_make_yolo_layer: 5110

    make_yolo_layer

    1) forward_yolo_layer;

    activate_array,scal_add_cpu,entry_index,get_yolo_box,float_to_box_stride,compare_yolo_class,box_iou,delta_yolo_box,delta_yolo_box()_accumulate,averages_yolo_deltas,compute classification loss

    2) backward_yolo_layer; backward_yolo_layer_gpu;

    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);

    axpy_ongpu(l.batch*l.inputs, state.net.loss_scale * l.delta_normalizer, l.delta_gpu, 1, state.delta, 1);

    3)cuda_make_array

    4)cudaHostAlloc

    5. duration_parse_network_cfg_custom: 2007

    :parse_network_cfg_custom(char *filename, int batch, int time_steps)

    1) read_cfg(filename);

    2) make_network(sections->size - 1);

    3) parse_net_options(options, &net);

    4) pre_allocate_pinned_memory((size_t)1024 * 1024 * 1024 * 8);   // pre-allocate 8 GB CPU-RAM for pinned memory

    5) parse_convolutional(options, params);

    parse_local(options, params);

    parse_activation(options, params);

    parse_rnn(options, params);

    parse_gru(options, params);

    parse_lstm(options, params);

    parse_conv_lstm(options, params);

    parse_history(options, params);

    parse_crnn(options, params);

    parse_connected(options, params);

    parse_crop(options, params);

    parse_cost(options, params);

    parse_region(options, params);

    parse_yolo(options, params);

    parse_gaussian_yolo(options, params);

    parse_detection(options, params);

    parse_softmax(options, params);

    parse_contrastive(options, params);

    parse_normalization(options, params);

    parse_batchnorm(options, params);

    parse_maxpool(options, params);

    parse_local_avgpool(options, params);

    parse_reorg(options, params);

    parse_reorg_old(options, params);

    parse_avgpool(options, params);

    parse_route(options, params);

    parse_upsample(options, params, net);

    parse_shortcut(options, params, net);

    parse_scale_channels(options, params, net);

    parse_sam(options, params, net);

    parse_dropout(options, params);

    cuda_make_array_pinned

    cuda_make_array_pinned_preallocated

    set_specified_workspace_limit

    cuda_make_array:cuda_pull_array_async,activate_array_ongpu

    get_network_output

    CHECK_CUDA   

    Coco依赖的软件:coco.data,yolov4.cfg,yolov4.conv.137,trainvalno5k.txt,train2014

    read_data_cfg

    option_find_str

    open_valid_file

    cuda_set_device

    parse_network_cfg_custom

    get_labels_custom

    basecfg

    parse_network_cfg

    get_paths(train_images)

    list_to_array(plist)

    get_current_iteration(net)

    draw_train_chart

    load_data

    rand_scale(rand_coef);   

    pthread_join

    float_to_box

    float_to_image

    compute_loss

    train_network_waitkey

    train_networks

    free_data

    resize_network

    validate_detector_map

    save_weights

    draw_train_loss

    sync_nets

     

        release_mat(&img);

        destroy_all_windows_cv();

     

        // free memory

        pthread_join(load_thread, 0);

        free_data(buffer);

     

        free_load_threads(&args);

     

        free(base);

        free(paths);

        free_list_contents(plist);

        free_list(plist);

     

        free_list_contents_kvp(options);

        free_list(options);

     

       free_network;

       free(nets);

       free_network(net_map);

     

    Makefile

    GPU=0

    CUDNN=0

    CUDNN_HALF=0

    OPENCV=0

    AVX=0

    OPENMP=0

    LIBSO=0

    ZED_CAMERA=0

    ZED_CAMERA_v2_8=0

     

    # set GPU=1 and CUDNN=1 to speedup on GPU

    # set CUDNN_HALF=1 to further speedup 3 x times (Mixed-precision on Tensor Cores) GPU: Volta, Xavier, Turing and higher

    # set AVX=1 and OPENMP=1 to speedup on CPU (if error occurs then set AVX=0)

    # set ZED_CAMERA=1 to enable ZED SDK 3.0 and above

    # set ZED_CAMERA_v2_8=1 to enable ZED SDK 2.X

     

    USE_CPP=0

    DEBUG=0

     

    # Default CUDA code-generation targets passed to nvcc.
    # This is ONE assignment: every line except the last must end in a backslash so
    # make joins them; without the backslashes the extra -gencode lines are parsed
    # as stand-alone (invalid) makefile lines.
    ARCH= -gencode arch=compute_30,code=sm_30 \
          -gencode arch=compute_35,code=sm_35 \
          -gencode arch=compute_50,code=[sm_50,compute_50] \
          -gencode arch=compute_52,code=[sm_52,compute_52] \
          -gencode arch=compute_61,code=[sm_61,compute_61]

     

    OS := $(shell uname)

     

    # Tesla A100 (GA100), DGX-A100, RTX 3080

    # ARCH= -gencode arch=compute_80,code=[sm_80,compute_80]

     

    # Tesla V100

    # ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]

     

    # GeForce RTX 2080 Ti, RTX 2080, RTX 2070, Quadro RTX 8000, Quadro RTX 6000, Quadro RTX 5000, Tesla T4, XNOR Tensor Cores

    # ARCH= -gencode arch=compute_75,code=[sm_75,compute_75]

     

    # Jetson XAVIER

    # ARCH= -gencode arch=compute_72,code=[sm_72,compute_72]

     

    # GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4

    # ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61

     

    # GP100/Tesla P100 - DGX-1

    # ARCH= -gencode arch=compute_60,code=sm_60

     

    # For Jetson TX1, Tegra X1, DRIVE CX, DRIVE PX - uncomment:

    # ARCH= -gencode arch=compute_53,code=[sm_53,compute_53]

     

    # For Jetson Tx2 or Drive-PX2 uncomment:

    # ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]

     

     

    VPATH=./src/

    EXEC=darknet

    OBJDIR=./obj/

     

    ifeq ($(LIBSO), 1)

    LIBNAMESO=libdarknet.so

    APPNAMESO=uselib

    endif

     

    ifeq ($(USE_CPP), 1)

    CC=g++

    else

    CC=gcc

    endif

     

    CPP=g++ -std=c++11

    NVCC=nvcc

    OPTS=-Ofast

    LDFLAGS= -lm -pthread

    COMMON= -Iinclude/ -I3rdparty/stb/include

    CFLAGS=-Wall -Wfatal-errors -Wno-unused-result -Wno-unknown-pragmas -fPIC

     

    ifeq ($(DEBUG), 1)

    #OPTS= -O0 -g

    #OPTS= -Og -g

    COMMON+= -DDEBUG

    CFLAGS+= -DDEBUG

    else

    ifeq ($(AVX), 1)

    CFLAGS+= -ffp-contract=fast -mavx -mavx2 -msse3 -msse4.1 -msse4.2 -msse4a

    endif

    endif

     

    CFLAGS+=$(OPTS)

     

    ifneq (,$(findstring MSYS_NT,$(OS)))

    LDFLAGS+=-lws2_32

    endif

     

    ifeq ($(OPENCV), 1)

    COMMON+= -DOPENCV

    CFLAGS+= -DOPENCV

    LDFLAGS+= `pkg-config --libs opencv4 2> /dev/null || pkg-config --libs opencv`

    COMMON+= `pkg-config --cflags opencv4 2> /dev/null || pkg-config --cflags opencv`

    endif

     

    ifeq ($(OPENMP), 1)

        ifeq ($(OS),Darwin) #MAC

            CFLAGS+= -Xpreprocessor -fopenmp

        else

            CFLAGS+= -fopenmp

        endif

    LDFLAGS+= -lgomp

    endif

     

    ifeq ($(GPU), 1)

    COMMON+= -DGPU -I/usr/local/cuda/include/

    CFLAGS+= -DGPU

    ifeq ($(OS),Darwin) #MAC

    LDFLAGS+= -L/usr/local/cuda/lib -lcuda -lcudart -lcublas -lcurand

    else

    LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand

    endif

    endif

     

    ifeq ($(CUDNN), 1)

    COMMON+= -DCUDNN

    ifeq ($(OS),Darwin) #MAC

    CFLAGS+= -DCUDNN -I/usr/local/cuda/include

    LDFLAGS+= -L/usr/local/cuda/lib -lcudnn

    else

    CFLAGS+= -DCUDNN -I/usr/local/cudnn/include

    LDFLAGS+= -L/usr/local/cudnn/lib64 -lcudnn

    endif

    endif

     

    ifeq ($(CUDNN_HALF), 1)

    COMMON+= -DCUDNN_HALF

    CFLAGS+= -DCUDNN_HALF

    ARCH+= -gencode arch=compute_70,code=[sm_70,compute_70]

    endif

     

    ifeq ($(ZED_CAMERA), 1)

    CFLAGS+= -DZED_STEREO -I/usr/local/zed/include

    ifeq ($(ZED_CAMERA_v2_8), 1)

    LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed

    #-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0

    else

    LDFLAGS+= -L/usr/local/zed/lib -lsl_zed

    #-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0

    endif

    endif

     

    OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o gaussian_yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o scale_channels_layer.o sam_layer.o

    ifeq ($(GPU), 1)

    LDFLAGS+= -lstdc++

    OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o

    endif

     

    OBJS = $(addprefix $(OBJDIR), $(OBJ))

    DEPS = $(wildcard src/*.h) Makefile include/darknet.h

     

    all: $(OBJDIR) backup results setchmod $(EXEC) $(LIBNAMESO) $(APPNAMESO)

     

    ifeq ($(LIBSO), 1)

    CFLAGS+= -fPIC

     

    $(LIBNAMESO): $(OBJDIR) $(OBJS) include/yolo_v2_class.hpp src/yolo_v2_class.cpp

        $(CPP) -shared -std=c++11 -fvisibility=hidden -DLIB_EXPORTS $(COMMON) $(CFLAGS) $(OBJS) src/yolo_v2_class.cpp -o $@ $(LDFLAGS)

     

    $(APPNAMESO): $(LIBNAMESO) include/yolo_v2_class.hpp src/yolo_console_dll.cpp

        $(CPP) -std=c++11 $(COMMON) $(CFLAGS) -o $@ src/yolo_console_dll.cpp $(LDFLAGS) -L ./ -l:$(LIBNAMESO)

    endif

     

    $(EXEC): $(OBJS)

        $(CPP) -std=c++11 $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)

     

    $(OBJDIR)%.o: %.c $(DEPS)

        $(CC) $(COMMON) $(CFLAGS) -c $< -o $@

     

    $(OBJDIR)%.o: %.cpp $(DEPS)

        $(CPP) -std=c++11 $(COMMON) $(CFLAGS) -c $< -o $@

     

    $(OBJDIR)%.o: %.cu $(DEPS)

        $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@

     

    $(OBJDIR):

        mkdir -p $(OBJDIR)

    backup:

        mkdir -p backup

    results:

        mkdir -p results

    setchmod:

        chmod +x *.sh

     

    .PHONY: clean

     

    clean:

        rm -rf $(OBJS) $(EXEC) $(LIBNAMESO) $(APPNAMESO)

     

    SPP结构

     

    人工智能芯片与自动驾驶
  • 相关阅读:
    C++从文件名中去掉后缀
    《深度学习21天实战caffe》_简单读书笔记
    初等变换和阶梯矩阵【】
    A*寻路-2(忘了哪个是最终版的)
    [TWLFramework] 全局委托 全局枚举
    [TWLFramework] Singleton
    [TWLFramework] MessageCenter
    [TWLFramework] Message
    [TWLFramework] UIManager
    [TWLFramework] BasePanel
  • 原文地址:https://www.cnblogs.com/wujianming-110117/p/13845780.html
Copyright © 2011-2022 走看看