Environment dependencies:
PyTorch 0.4 or later
tensorboardX: pip install tensorboardX and pip install tensorflow (the tensorboard viewer used below ships with TensorFlow)
Add tensorboardX logging calls to your project code; they write event files that a browser can then display as visualizations.
Official example:
By default, a runs folder is created in the current working directory, and the summary information is stored inside it.
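A minimal standalone sketch of this default behavior (the tag name and the decreasing dummy "loss" values are only for illustration):

    from tensorboardX import SummaryWriter

    writer = SummaryWriter()  # no log_dir given, so events are written under ./runs/
    for step in range(100):
        # log a fake, decreasing "loss" so there is something to plot
        writer.add_scalar('train/loss', 1.0 / (step + 1), step)
    writer.close()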
From the directory that contains runs, enter on the command line:
tensorboard --logdir runs (the command is tensorboard, not tensorboardX)
This prints a URL; paste it into a browser to visualize how loss, accuracy, learning rate, and other logged values change over time.
An example of setting up summaries in a PyTorch training script:
import argparse
import os
import numpy as np
import torch  # needed for torch.optim, torch.nn, torch.load, etc.
from tqdm import tqdm

from mypath import Path
from dataloaders import make_data_loader
from modeling.sync_batchnorm.replicate import patch_replication_callback
from modeling.deeplab import *
from modeling.psp_net import *
from utils.loss import SegmentationLosses
from utils.calculate_weights import calculate_weigths_labels
from utils.lr_scheduler import LR_Scheduler
from utils.saver import Saver
from utils.summaries import TensorboardSummary
from utils.metrics import Evaluator
from utils.misc import CrossEntropyLoss2d

class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary (the tensorboardX writer on the PyTorch side)
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader; adapt it when switching datasets
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        # Define network; only the number of classes needs to change
        model = PSPNet(num_classes=self.nclass).cuda()
        # The original DeepLabV3+ model:
        # model = DeepLab(num_classes=self.nclass,
        #                 backbone=args.backbone,
        #                 output_stride=args.out_stride,
        #                 sync_bn=args.sync_bn,
        #                 freeze_bn=args.freeze_bn)

        # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                 {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]

        # Define Optimizer (DeepLabV3+):
        # optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
        #                             weight_decay=args.weight_decay, nesterov=args.nesterov)
        # PSPNet optimizer (modified); note the learning rate must come from args.lr
        optimizer = torch.optim.SGD([
            {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'],
             'lr': 2 * args.lr},
            {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'],
             'lr': args.lr, 'weight_decay': args.weight_decay}
        ], momentum=args.momentum, nesterov=True)

        # Define Criterion; redefined in utils/loss.py and used via self.criterion.
        # Whether to use class balanced weights:
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    # Training loop
    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # Original DeepLabV3+ loop; when switching to PSPNet the loss handling must change:
        # for inputs_slice, gts_slice in zip(inputs, gts):
        #     inputs_slice = Variable(inputs_slice).cuda()
        #     gts_slice = Variable(gts_slice).cuda()
        #
        #     optimizer.zero_grad()
        #     outputs, aux = net(inputs_slice)
        #     assert outputs.size()[2:] == gts_slice.size()[1:]
        #     assert outputs.size()[1] == voc.num_classes
        #
        #     main_loss = criterion(outputs, gts_slice)
        #     aux_loss = criterion(aux, gts_slice)
        #     loss = main_loss + 0.4 * aux_loss
        #     loss.backward()
        #     optimizer.step()
        #
        #     train_main_loss.update(main_loss.item(), slice_batch_pixel_size)
        #     train_aux_loss.update(aux_loss.item(), slice_batch_pixel_size)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)

            self.optimizer.zero_grad()
            outputs, aux = self.model(image)  # PSPNet returns the main prediction and an auxiliary output
            assert outputs.size()[2:] == target.size()[1:]
            assert outputs.size()[1] == self.nclass
            loss = self.criterion(outputs, target)
            loss.backward()

            # DeepLabV3+ version:
            # self.optimizer.zero_grad()
            # output = self.model(image)
            # loss = self.criterion(output, target)
            # loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset, image, target, outputs, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

def main():
    # Hyperparameters
    parser = argparse.ArgumentParser(description="PyTorch DeeplabV3Plus Training")
    # Backbone network used for feature extraction
    parser.add_argument('--backbone', type=str, default='resnet',
                        choices=['resnet', 'xception', 'drn', 'mobilenet'],
                        help='backbone name (default: resnet)')
    parser.add_argument('--out-stride', type=int, default=16,
                        help='network output stride (default: 16)')
    parser.add_argument('--dataset', type=str, default='pascal',
                        choices=['pascal', 'coco', 'cityscapes'],
                        help='dataset name (default: pascal)')
    parser.add_argument('--use-sbd', action='store_true', default=False,
                        help='whether to use SBD dataset (default: False)')
    parser.add_argument('--workers', type=int, default=4,
                        metavar='N', help='dataloader threads')
    parser.add_argument('--base-size', type=int, default=513,
                        help='base image size')
    # Reduce this value if CUDA runs out of memory; the original value was 513
    parser.add_argument('--crop-size', type=int, default=256,
                        help='crop image size')
    parser.add_argument('--sync-bn', type=bool, default=None,
                        help='whether to use sync bn (default: auto)')
    parser.add_argument('--freeze-bn', type=bool, default=False,
                        help='whether to freeze bn parameters (default: False)')
    parser.add_argument('--loss-type', type=str, default='ce',
                        choices=['ce', 'focal'],
                        help='loss func type (default: ce)')
    # training hyper params
    parser.add_argument('--epochs', type=int, default=None, metavar='N',
                        help='number of epochs to train (default: auto)')
    parser.add_argument('--start_epoch', type=int, default=0,
                        metavar='N', help='start epochs (default: 0)')
    parser.add_argument('--batch-size', type=int, default=None,
                        metavar='N', help='input batch size for training (default: auto)')
    parser.add_argument('--test-batch-size', type=int, default=None,
                        metavar='N', help='input batch size for testing (default: auto)')
    parser.add_argument('--use-balanced-weights', action='store_true', default=False,
                        help='whether to use balanced weights (default: False)')
    # optimizer params
    parser.add_argument('--lr', type=float, default=None, metavar='LR',
                        help='learning rate (default: auto)')
    parser.add_argument('--lr-scheduler', type=str, default='poly',
                        choices=['poly', 'step', 'cos'],
                        help='lr scheduler mode: (default: poly)')
    parser.add_argument('--momentum', type=float, default=0.9,
                        metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--weight-decay', type=float, default=5e-4,
                        metavar='M', help='w-decay (default: 5e-4)')
    parser.add_argument('--nesterov', action='store_true', default=False,
                        help='whether use nesterov (default: False)')
    # cuda, seed and logging
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--gpu-ids', type=str, default='0',
                        help='use which gpu to train, must be a comma-separated list of integers only (default=0)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    # checking point
    parser.add_argument('--resume', type=str, default=None,
                        help='put the path to resuming file if needed')
    parser.add_argument('--checkname', type=str, default=None,
                        help='set the checkpoint name')
    # finetuning pre-trained models
    parser.add_argument('--ft', action='store_true', default=False,
                        help='finetuning on a different dataset')
    # evaluation option
    parser.add_argument('--eval-interval', type=int, default=1,
                        help='evaluation interval (default: 1)')
    parser.add_argument('--no-val', action='store_true', default=False,
                        help='skip validation during training')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        try:
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
        except ValueError:
            raise ValueError('Argument --gpu_ids must be a comma-separated list of integers only')

    if args.sync_bn is None:
        if args.cuda and len(args.gpu_ids) > 1:
            args.sync_bn = True
        else:
            args.sync_bn = False

    # Default epochs, batch_size and lr
    if args.epochs is None:
        epoches = {
            'coco': 30,
            'cityscapes': 200,
            'pascal': 50,
        }
        args.epochs = epoches[args.dataset.lower()]

    if args.batch_size is None:
        args.batch_size = 2 * len(args.gpu_ids)  # the original default was 4 per GPU

    if args.test_batch_size is None:
        args.test_batch_size = args.batch_size

    if args.lr is None:
        lrs = {
            'coco': 0.1,
            'cityscapes': 0.01,
            'pascal': 0.007,
        }
        args.lr = lrs[args.dataset.lower()] / (2 * len(args.gpu_ids)) * args.batch_size

    if args.checkname is None:
        args.checkname = 'deeplab-' + str(args.backbone)
    print(args)
    torch.manual_seed(args.seed)
    trainer = Trainer(args)
    print('Starting Epoch:', trainer.args.start_epoch)
    print('Total Epochs:', trainer.args.epochs)
    for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
        trainer.training(epoch)
        if not trainer.args.no_val and epoch % args.eval_interval == (args.eval_interval - 1):
            trainer.validation(epoch)

    trainer.writer.close()

if __name__ == "__main__":
    main()
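The script above delegates all writer handling to the TensorboardSummary helper in utils/summaries.py. Below is a minimal sketch of what such a helper can look like; it reproduces the interface used above (create_summary, visualize_image), but the image-logging details are simplified assumptions: the actual repository colorizes predictions with a dataset-specific decoder rather than logging raw label grids.

    import os
    import torch
    from torchvision.utils import make_grid
    from tensorboardX import SummaryWriter

    class TensorboardSummary(object):
        def __init__(self, directory):
            self.directory = directory

        def create_summary(self):
            # Events go into the experiment directory chosen by the Saver
            return SummaryWriter(log_dir=os.path.join(self.directory))

        def visualize_image(self, writer, dataset, image, target, output, global_step):
            # Log the first three input images of the batch as one grid
            grid_image = make_grid(image[:3].clone().cpu().data, 3, normalize=True)
            writer.add_image('Image', grid_image, global_step)
            # Log the argmax prediction and the ground truth as raw label grids
            # (simplified; the real helper decodes them into color maps)
            pred = torch.argmax(output[:3], dim=1, keepdim=True).float()
            writer.add_image('Predicted label',
                             make_grid(pred.cpu().data, 3, normalize=True), global_step)
            gt = target[:3].unsqueeze(1).float()
            writer.add_image('Groundtruth label',
                             make_grid(gt.cpu().data, 3, normalize=True), global_step)

With this in place, pointing tensorboard --logdir at the experiment directory created by the Saver shows train/total_loss_iter, val/mIoU, and the image panels updating as training progresses.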