在YoLo-V3中使用Darknet53这个网络结构。下图为Darknet-53的网络结构,加入了残差块的结构。
Yolo-V3中的改进:
(1)多尺度计算,Yolo-V3又3个不同特征尺度的输出(使用的是CoCo数据集),分别是13×13×225,26×26×225,52×52×225,这里借鉴了FPN的思想不仅在每个特征图上分别独立做预测,同时通过将小特征图上采样到与大的特征图大小相同,然后与大的特征图拼接做进一步预测。
(2)Yolo-V3代价函数修改,Yolo-v3对类别预测的代价函数进行了修改,没有收用softmax函数,因为原来的分类网络中使用softmax层都是假设一张图片或一个object只属于一个类别,但是在一些复杂的场景下,一个object可能属于多个类,那么在使用softmax可能就会导致漏掉一些类别,所以在Yolo-V3中使用逻辑回归层来对每个类别做二分类,因此当一张图像经过特征提取后的某一类输出如果大于0.5那么就属于这个类。这样一个框就可以预测多个类别。
在Yolo-V3中的维度聚类:
Yolo-V3中使用了k-means聚类计算anchor,聚类的目的是让anchor和邻近的ground truth有更大的IOU,这和anchor的尺寸没有直接的关系。
(1)使用聚类原始数据只有标签框的检测数据集,Yolo-V3都会生成一个包含标注框位置和类别的.txt文件,其中每行都包含(xi,yi,wi,hi)即ground truth相对于原图的坐标。
(2)首先给定k个聚类中心点(wi,hi),这里wi,hi是anchor的宽和高,由于anchor位置不固定,所以没有(x,y)坐标, 只有宽和高。
(3)计算每个标注框和每个聚类中心的距离,d=1-IOU(标注框,聚类中心),这里在计算时将每个标注框的中心点都与聚类中心重合,然后计算IOU,将标注框分配给"距离"最近的聚类中心
(4)所有标注框分配完毕后,对每个族重新计算聚类中心,wi' = 1/Ni∑wi,hi'=1/Ni∑hi,Ni是第i个族的标注框个数,其实就是求该族中所有标注框宽和高的平均值,然后重复3,4步知道聚类中心变化很小。
网络结构(返回3个尺度的输出)
from keras.layers import BatchNormalization from keras.layers.advanced_activations import LeakyReLU from keras.layers import Conv2D,ZeroPadding2D,Add,UpSampling2D,Concatenate from keras.regularizers import l2 def conv(x,*args,**kwargs): new_kwargs = {"kernel_regularizer":l2(5e-4),"use_bias":False} new_kwargs["padding"] = "valid" if kwargs.get("strides")==(2,2) else "same" new_kwargs.update(kwargs) x =Conv2D(*args,**new_kwargs)(x) return x def CBL(x,*args,**kwargs): x = conv(x,*args,**kwargs) x = BatchNormalization()(x) x = LeakyReLU(alpha=0.1)(x) return x def PCBL(x,num_filters): x = ZeroPadding2D(((1,0),(1,0)))(x) x = CBL(x,num_filters,(3,3),strides=(2,2)) return x def CBLR(x,num_filters): y = CBL(x,num_filters,(1,1)) y = CBL(y,num_filters*2,(3,3)) x = Add()([x,y]) return x def CBL5(x,num_filters): x =CBL(x,num_filters,(1,1)) x =CBL(x,num_filters*2,(3,3)) x =CBL(x,num_filters,(1,1)) x =CBL(x,num_filters*2,(3,3)) x =CBL(x,num_filters,(1,1)) return x def CBLC(x,num_filters,out_filters): x =CBL(x,num_filters*2,(3,3)) x =conv(x,out_filters,(1,1)) return x def CBLU(x,num_filters): x = CBL(x,num_filters,(1,1)) x =UpSampling2D(2)(x) return x def body(inputs,num_anchors,num_classes): out=[] x = CBL(inputs,32,(3,3)) n = [1,2,8,8,4] for i in range(5): x = PCBL(x,2**(6+i)) for _ in range(n[i]): x = CBLR(x,2**(5+i)) if i in [2,3,4]: out.append(x) x1 = CBL5(out[2],512) y1 = CBLC(x,512,num_anchors*(num_classes+5)) x = CBLU(x1,256) x = Concatenate()([x,out[1]]) x2 = CBL5(x,256) y2 = CBLC(x2,256,num_anchors*(num_classes+5)) x = CBLU(x2,128) x =Concatenate()([x,out[0]]) x3 = CBL5(x,128) y3 = CBLC(x3,128,num_anchors*(num_classes+5)) return [y3,y2,y1]
从数据集中的xml文件中获取x,y,w,h,label的信息.
import numpy as np from xml.etree.ElementTree import parse class PascalVocXmlParser(object): def __init__(self): pass def get_fname(self,annotation_file): root = self._root_tag(annotation_file) return root.find("filename").text def get_width(self,annotation_file): tree = self._tree(annotation_file) for elem in tree.iter(): print(elem) if "width" in elem.tag: return float(elem.text) def get_height(self,annotation_file): tree = self._tree(annotation_file) for elem in tree.iter(): if "height" in elem.tag: return float(elem.text) def get_labels(self,annotation_file): root = self._root_tag(annotation_file) labels=[] obj_tags =root.findall("object") for t in obj_tags: labels.append(t.find("name").text) return labels def get_boxes(self,annotation_file): root = self._root_tag(annotation_file) bbs=[] obj_tags = root.findall("object") for t in obj_tags: box_tag = t.find("bndbox") x1 = box_tag.find("xmin").text y1 = box_tag.find("ymin").text x2 = box_tag.find("xmax").text y2 = box_tag.find("ymax").text box = np.array([float(x1),float(x2),float(y1),float(y2)]) bbs.append(box) bbs = np.array(bbs) return bbs #获取所有根节点 def _root_tag(self,fname): tree = parse(fname) root = tree.getroot() return root def _tree(self,fname): tree = parse(fname) return tree
根据从xml文件中获得的信息求ytrue
import numpy as np import os from PIL import Image from nets.YoLo_v3_get_xml import PascalVocXmlParser #根据xml文件获取文件名,图片大小,label,box的信息 def get_parse(ann_fname,input_size): parser = PascalVocXmlParser() fname = parser.get_fname(ann_fname) weight = parser.get_width(ann_fname) height = parser.get_height(ann_fname) labels = parser.get_labels(ann_fname) boxes = parser.get_boxes(ann_fname) for i in range(len(boxes)): boxes[i][0] = boxes[i][0]/weight*input_size boxes[i][1] = boxes[i][1]/weight*input_size boxes[i][2] = boxes[i][2]/height*input_size boxes[i][3] = boxes[i][3]/height*input_size return fname,labels,boxes #计算IOU def get_IOU(box1,box2): w_min = min(box1[1],box2[1]) h_min = min(box1[3],box2[3]) w = w_min-box2[0] h = h_min-box1[2] intersect = w*h merge = (box1[1]-box1[0])*(box1[3]-box1[2]) +(box2[1]-box2[0])*(box2[3]-box2[2]) IOU = intersect/(merge-intersect) return IOU #把box和anchor一个点对齐计算IOU #计算anchor和ground truth的最大IOU的位置。 def get_anchor(anchors,box): IOUList = [] anchorslist =np.zeros(((len(anchors)),4),dtype="float32") for i in range(len(anchorslist)): anchorslist[i][0] = box[0] anchorslist[i][1] = anchorslist[i][0] + anchors[i][0] anchorslist[i][2] = box[2] anchorslist[i][3] = anchorslist[i][2] + anchors[i][1] IOU = get_IOU(box,anchorslist[i]) IOUList.append(IOU) anchor =IOUList.index((max(IOUList))) return anchor def get_img(img_dir,fname,input_size): img_fname =os.path.join(img_dir,fname) image = Image.open(img_fname) image = image.resize((input_size,input_size)) image = np.array(image,dtype="float32") image /=255. return image #anchor共有9个,每个尺度3个 def get_ytrue(boxes,anchors,anchor_shape,b,pattern_shape,input_size,classes,labels,ytrues): newbox = np.zeros((4), dtype="float32") for i in range(len(boxes)): #计算出所有anchor与ground truth的最大IOU的index anchor = get_anchor(anchors,boxes[i]) #计算出anchor属于哪个尺度 layer_anchor = anchor//anchor_shape[1] #计算anchor属于该尺度的哪个w,h box_anchor = anchor%anchor_shape[1] rate = pattern_shape[layer_anchor]/input_size cent_x = (boxes[i][0]+boxes[i][1])/2*rate cent_y = (boxes[i][2]+boxes[i][3])/2*rate #向下取整 x = np.floor(cent_x).astype("int32") y = np.floor(cent_y).astype("int32") w = boxes[i][1]-boxes[i][0] h = boxes[i][3]-boxes[i][2] #类别 c = classes.index(labels[i]) newbox[0] = cent_x newbox[1] = cent_y newbox[2] = np.log(max(w,1))/anchors[anchor][0] newbox[3] = np.log(max(h,1))/anchors[anchor][1] #获得ytrue ytrues[layer_anchor][b,x,y,box_anchor,0:4] = newbox[0:4] ytrues[layer_anchor][b,x,y,box_anchor,4] =1 ytrues[layer_anchor][b,x,y,box_anchor,5+c] =1 return ytrues #数据生成器 def generator(batch_size,classes,ann_fnames,img_dir,input_size,anchors): pattern_shape = [52, 26, 13] anchor_shape=[3,3] n = len(ann_fnames) i = 0 while True: inputs = [] ytrues = [np.zeros((batch_size, pattern_shape[l], pattern_shape[l], anchor_shape[1], 5 + len(classes))) for l in range(3)] #构造一个batch_size for b in range(batch_size): if i == 0: np.random.shuffle(ann_fnames) fname, labels, boxes = get_parse(ann_fnames[i], input_size) ytrues = get_ytrue(boxes,anchors,anchor_shape,b,pattern_shape,input_size,classes,labels,ytrues) img = get_img(img_dir, fname, input_size) inputs.append(img) i = (i + 1) % n inputs = np.array(inputs) #返回一个batch_size yield inputs,[ytrues[2],ytrues[1],ytrues[0]]
计算loss
Yolo-V3采用直接位置预测,就是预测边界框中心点相对于对应cell左上角的相对位置偏移,为了将边界框中心点约束在当前cell中,使用sigmoi函数 处理偏移值,这样预测的偏移值在(0,1)范围内。在Faster-RCNN中不加任何限制就会导致不管初始的bbox在图像的什么位置,通过预测偏移量可以将bbox移动到图像任何位置。
loss组成
#计算回归loss def get_loss_box(ytrue,ypre,box_scale,object_mask): xy_delta = box_scale * object_mask * (ypre[...,:2]-ytrue[...,:2]) wh_delta = box_scale * object_mask * (tf.sqrt(ypre[...,2:4])-tf.sqrt(ytrue[...,2:4])) loss_xy = K.sum(K.square(xy_delta),list(range(1,5))) loss_wh = K.sum(K.square(wh_delta),list(range(1,5))) return loss_xy+loss_wh #计算置信度loss def get_loss_con(ytrue,ypre,noobj_scale,object_mask,IOU): object_mask = K.squeeze(object_mask,axis=-1) con_delta = object_mask * (ypre*IOU-ytrue) + noobj_scale * (1-object_mask)*(ypre*IOU-ytrue) loss_con = K.sum(K.square(con_delta),list(range(1,4))) return loss_con #计算类别loss def get_loss_c(ytrue,ypre,object_mask): ytrue = tf.cast(ytrue,tf.int64) loss_class = object_mask*tf.expand_dims(tf.nn.softmax_cross_entropy_with_logits_v2(labels=ytrue,logits=ypre),4) return loss_class def lossCalculator(ytrue,ypre,anchors,batch_size,input_size,box_scale,noobj_scale,ignore_thresh): #ypre从网络中得到的shape=(batch_size,13,13,3*(num_classes+5))这里要转换成(batch_size,13,13,3,num_classes+5) ypre = K.reshape(ypre,shape=[-1, ypre.shape[-3], ypre.shape[-2], anchors.shape[0], ypre.shape[-1] // anchors.shape[0]]) ytrue = K.reshape(ytrue, shape=[-1, ypre.shape[1], ypre.shape[2], ypre.shape[3], ypre.shape[4]]) ytrue,ypre = get_ytrue_ypre(ytrue,ypre,anchors,batch_size) object_mask = K.expand_dims(ytrue[...,4],4) IOU = get_IOU(ytrue[...,:4],ypre[...,:4],input_size) loss_box = get_loss_box(ytrue[...,:4],ypre[...,:4],box_scale,object_mask) loss_con = get_loss_con(ytrue[...,4],ypre[...,4],noobj_scale,object_mask,IOU) loss_class = get_loss_c(ytrue[...,5:],ypre[...,5:],object_mask) losses = loss_box+loss_con+loss_class return tf.reduce_mean(losses) def fn_loss(ytrues,ypres): ignore_thresh =0.5 noobj_scale=0.5 box_scale=1 input_size =416 batch_size =1 anchors = np.array([[[10, 13], [16, 30], [33, 23]], [[30, 61], [62, 45], [59, 119]], [[116, 90], [156, 198], [373, 326]]]) losses=[] loss =lossCalculator(ytrues,ypres,anchors[2-ypres.shape[1]//26],batch_size,input_size,box_scale,noobj_scale,ignore_thresh) losses.append(loss) return tf.sqrt(losses)