作者:欧新宇(Xinyu OU)
本文档所展示的测试结果,均运行于:Intel Core i7-7700K CPU 4.2GHz, nVidia GeForce GTX 1080 Ti
最后更新:2021年2月19日
深度学习的训练包括四种模式:
下面的训练代码中,将同时实现以上四种训练模式。
# 载入项目文件夹
import sys
sys.path.append(r"D:\\Workspace\\MyProjects\\ButterflyClassification")
#################################################################################
import os
import time
import json
from utils.draw import draw_process
from utils.logger import logger
from utils.optimizer import learning_rate_setting, optimizer_setting
from utils.reader import train_reader, trainval_reader, val_reader
from config import *
from eval import *
import paddle
from paddle.static import InputSpec
# Initial configuration variables
total_epoch = train_parameters['total_epoch']  # total number of training epochs (from config)
# Initialize the plotting lists: per-batch training curves and per-epoch
# validation curves, consumed by draw_process() at the end of the run.
all_train_iters = []
all_train_losses = []
all_train_accs_top1 = []
all_train_accs_top5 = []
all_test_losses = []
all_test_iters = []
all_test_accs_top1 = []
all_test_accs_top5 = []
def train(model):
    """Run the custom (non-`fit`) training loop for `model`.

    Trains for ``total_epoch`` epochs, logging every ``log_interval``
    batches, validating every ``eval_interval`` epochs, and saving the
    best-top1 model both as a checkpoint (resumable) and as an inference
    model (deployable).

    Args:
        model: a ``paddle.Model`` already prepared with optimizer, loss
            and ``Accuracy(topk=(1, 5))`` metric.

    Side effects: appends to the module-level plotting lists, writes model
    files under ``checkpoint_path`` / ``final_models_path``, and reads the
    module-level ``start`` timestamp set in ``__main__``.
    """
    # Temporary training state
    num_batch = 0
    best_result = 0
    best_result_id = 0
    elapsed = 0
    # Choose the training data source according to the config file.
    # Fixed: the original silently fell through when 'training_data' had an
    # unexpected value, leaving data_reader undefined (NameError later).
    if train_parameters['training_data'] == 'trainval':
        data_reader = trainval_reader
    elif train_parameters['training_data'] == 'train':
        data_reader = train_reader
    else:
        raise ValueError(
            "train_parameters['training_data'] must be 'train' or 'trainval', "
            "got {!r}".format(train_parameters['training_data']))
    for epoch in range(1, total_epoch + 1):
        for batch_id, (image, label) in enumerate(data_reader()):
            num_batch += 1
            label = paddle.unsqueeze(label, axis=1)
            loss, acc = model.train_batch([image], [label])
            # Log every `log_interval` batches (suitable for large datasets).
            if num_batch % train_parameters['log_interval'] == 0:
                avg_loss = loss[0][0]
                acc_top1 = acc[0][0]
                acc_top5 = acc[0][1]
                # elapsed_step = wall time since the previous log line
                elapsed_step = time.perf_counter() - elapsed - start
                elapsed = time.perf_counter() - start
                logger.info('Epoch:{}/{}, batch:{}, train_loss:[{:.5f}], acc_top1:[{:.5f}], acc_top5:[{:.5f}]({:.2f}s)'
                            .format(epoch, total_epoch, num_batch, loss[0][0], acc[0][0], acc[0][1], elapsed_step))
                # Record the training curve for later visualization.
                all_train_iters.append(num_batch)
                all_train_losses.append(avg_loss)
                all_train_accs_top1.append(acc_top1)
                all_train_accs_top5.append(acc_top5)
        # Validate every `eval_interval` epochs (and always on the last epoch).
        if epoch % train_parameters['eval_interval'] == 0 or epoch == total_epoch:
            avg_loss, avg_acc_top1, avg_acc_top5 = eval(model, val_reader())
            logger.info('[validation] Epoch:{}/{}, val_loss:[{:.5f}], val_top1:[{:.5f}], val_top5:[{:.5f}]'.format(epoch, total_epoch, avg_loss, avg_acc_top1, avg_acc_top5))
            # Record the validation curve for later visualization.
            all_test_iters.append(epoch)
            all_test_losses.append(avg_loss)
            all_test_accs_top1.append(avg_acc_top1)
            all_test_accs_top5.append(avg_acc_top5)
            # Save the best model so far as the "final" model.
            if avg_acc_top1 > best_result:
                best_result = avg_acc_top1
                best_result_id = epoch
                # Finetune model: used for tuning and resuming training.
                model.save(os.path.join(checkpoint_path, model_name + '_final'))
                # Inference model: used for deployment and prediction.
                model.save(os.path.join(final_models_path, model_name + '_final'), training=False)
                logger.info('已保存当前测试模型(epoch={})为最优模型:{}_final'.format(best_result_id, model_name))
                logger.info('最优top1测试精度:{:.5f} (epoch={})'.format(best_result, best_result_id))
            # Optionally checkpoint every evaluated epoch. Saving costs time and
            # storage; recommended for large models so interrupted training can
            # resume, and usually disabled for small/fast models.
            if train_parameters['checkpointed']:
                model.save(os.path.join(checkpoint_path, model_name + '_' + str(epoch)))
    logger.info('训练完成,最终性能accuracy={:.5f}(epoch={}), 总耗时{:.2f}s, 已将其保存为:{}_final'.format(best_result, best_result_id, time.perf_counter() - start, model_name))
if __name__ == '__main__':
    # Persist this run's hyper-parameters to the log (pretty-printed JSON).
    data = json.dumps(train_parameters, indent=4, ensure_ascii=False, sort_keys=False, separators=(',', ':'))
    logger.info(data)
    # Announce the run configuration.
    logger.info('训练参数保存完毕,使用{}模型, 训练{}数据, 训练集{}, 启动训练...'.format(train_parameters['architecture'],train_parameters['dataset_name'],train_parameters['training_data']))
    logger.info('当前模型目录为:{}'.format(model_name + '_' + train_parameters['starting_time']))
    # Declare tensor specs for the image input and the integer label.
    input_spec = InputSpec(shape=[None] + train_parameters['input_size'], dtype='float32', name='image')
    label_spec = InputSpec(shape=[None, 1], dtype='int64', name='label')
    # Build the official backbone; weights are downloaded automatically when
    # missing. pretrained=True|False toggles the ImageNet initialization.
    network = paddle.vision.models.resnet50(num_classes=train_parameters['class_dim'], pretrained=train_parameters['pretrained'])
    model = paddle.Model(network, input_spec, label_spec)
    logger.info('模型参数信息:')
    logger.info(model.summary())  # dump the layer-by-layer summary
    if train_parameters['checkpoint_train'] == True:
        # Resume an interrupted run from its checkpoint.
        model.load(checkpoint_load_model)
        logger.info('载入{}中断模型和参数完毕,开始从checkpoint恢复训练'.format(train_parameters['architecture']))
        logger.info('checkpoint模型:{}'.format(checkpoint_load_model))
    else:
        if train_parameters['pretrained'] == False:
            logger.info('载入{}模型完毕,从初始状态开始训练'.format(train_parameters['architecture']))
        elif train_parameters['pretrained_model'] == 'API':
            logger.info('载入Imagenet-{}预训练模型完毕,开始微调训练(fine-tune)'.format(train_parameters['architecture']))
        else:
            # Fine-tune from a project-local pretrained model.
            model.load(project_pretrained_model)
            logger.info('载入自定义预训练{}模型完毕,开始微调训练(fine-tune)'.format(train_parameters['architecture']))
            logger.info('预训练模型:{}'.format(project_pretrained_model))
    # Learning rate, optimizer, loss and metrics.
    lr = learning_rate_setting()
    optimizer = optimizer_setting(model, lr)
    model.prepare(optimizer,
                  paddle.nn.CrossEntropyLoss(),
                  paddle.metric.Accuracy(topk=(1,5)))
    # Launch the training loop.
    start = time.perf_counter()
    train(model)
    logger.info('训练完毕,结果路径{}.'.format(result_root_path))
    # Plot the recorded training/validation curves.
    logger.info('Done.')
    draw_process("Training Process", 'Train Loss', 'Train Accuracy(top1)', all_train_iters, all_train_losses, all_train_accs_top1, 'train')
    draw_process("Validation Results", 'Validation Loss', 'Validation Accuracy(top1)', all_test_iters, all_test_losses, all_test_accs_top1, 'val')
2021-03-06 19:40:59,114 - INFO: { "dataset_name":"Butterfly", "architecture":"Mobilenetv2_without_pretrained", "training_data":"train", "starting_time":"202103061920", "input_size":[ 3, 224, 224 ], "mean_value":[ 0.485, 0.456, 0.406 ], "std_value":[ 0.229, 0.224, 0.225 ], "num_trainval":490, "num_train":423, "num_val":67, "num_test":129, "class_dim":7, "label_dict":{ "0":"admiral", "1":"black_swallowtail", "2":"machaon", "3":"monarch_closed", "4":"monarch_open", "5":"peacock", "6":"zebra" }, "total_epoch":2, "batch_size":64, "log_interval":1, "eval_interval":1, "checkpointed":false, "checkpoint_train":false, "checkpoint_model":"Butterfly_Mobilenetv2_pretrained_final", "checkpoint_time":"202102182058", "pretrained":true, "pretrained_model":"API", "dataset_root_path":"D:\\Workspace\\ExpDatasets", "result_root_path":"D:\\Workspace\\ExpResults", "project_result_path":"D:\\Workspace\\MyProjects\\ButterflyClassification\\results", "useGPU":true, "learning_strategy":{ "optimizer_strategy":"Momentum", "learning_rate_strategy":"CosineAnnealingDecay", "learning_rate":0.01, "momentum":0.9, "Piecewise_boundaries":[ 60, 80, 90 ], "Piecewise_values":[ 0.01, 0.001, 0.0001, 1e-05 ], "Exponential_gamma":0.9, "Polynomial_decay_steps":10, "verbose":false }, "augmentation_strategy":{ "withAugmentation":true, "augmentation_prob":0.5, "rotate_angle":15, "Hflip_prob":0.5, "brightness":0.4, "contrast":0.4, "saturation":0.4, "hue":0.4 } } 2021-03-06 19:40:59,115 - INFO: 训练参数保存完毕,使用Mobilenetv2_without_pretrained模型, 训练Butterfly数据, 训练集train, 启动训练... 2021-03-06 19:40:59,115 - INFO: 当前模型目录为:Butterfly_Mobilenetv2_without_pretrained_202103061920 2021-03-06 19:40:59,190 - INFO: unique_endpoints {''} 2021-03-06 19:40:59,192 - INFO: File C:\Users\Administrator/.cache/paddle/hapi/weights\resnet50.pdparams md5 checking... 
2021-03-06 19:40:59,497 - INFO: Found C:\Users\Administrator/.cache/paddle/hapi/weights\resnet50.pdparams C:\Users\Administrator\anaconda3\lib\site-packages\paddle\fluid\dygraph\layers.py:1263: UserWarning: Skip loading for fc.weight. fc.weight receives a shape [2048, 1000], but the expected shape is [2048, 7]. warnings.warn(("Skip loading for {}. ".format(key) + str(err))) C:\Users\Administrator\anaconda3\lib\site-packages\paddle\fluid\dygraph\layers.py:1263: UserWarning: Skip loading for fc.bias. fc.bias receives a shape [1000], but the expected shape is [7]. warnings.warn(("Skip loading for {}. ".format(key) + str(err))) 2021-03-06 19:41:00,184 - INFO: 模型参数信息: 2021-03-06 19:41:00,218 - INFO: {'total_params': 23575495, 'trainable_params': 23469255} 2021-03-06 19:41:00,219 - INFO: 载入Imagenet-Mobilenetv2_without_pretrained预训练模型完毕,开始微调训练(fine-tune)
------------------------------------------------------------------------------- Layer (type) Input Shape Output Shape Param # =============================================================================== Conv2D-262 [[1, 3, 224, 224]] [1, 64, 112, 112] 9,408 BatchNorm2D-262 [[1, 64, 112, 112]] [1, 64, 112, 112] 256 ReLU-18 [[1, 64, 112, 112]] [1, 64, 112, 112] 0 MaxPool2D-2 [[1, 64, 112, 112]] [1, 64, 56, 56] 0 Conv2D-264 [[1, 64, 56, 56]] [1, 64, 56, 56] 4,096 BatchNorm2D-264 [[1, 64, 56, 56]] [1, 64, 56, 56] 256 ReLU-19 [[1, 256, 56, 56]] [1, 256, 56, 56] 0 Conv2D-265 [[1, 64, 56, 56]] [1, 64, 56, 56] 36,864 BatchNorm2D-265 [[1, 64, 56, 56]] [1, 64, 56, 56] 256 Conv2D-266 [[1, 64, 56, 56]] [1, 256, 56, 56] 16,384 BatchNorm2D-266 [[1, 256, 56, 56]] [1, 256, 56, 56] 1,024 Conv2D-263 [[1, 64, 56, 56]] [1, 256, 56, 56] 16,384 BatchNorm2D-263 [[1, 256, 56, 56]] [1, 256, 56, 56] 1,024 BottleneckBlock-17 [[1, 64, 56, 56]] [1, 256, 56, 56] 0 Conv2D-267 [[1, 256, 56, 56]] [1, 64, 56, 56] 16,384 BatchNorm2D-267 [[1, 64, 56, 56]] [1, 64, 56, 56] 256 ReLU-20 [[1, 256, 56, 56]] [1, 256, 56, 56] 0 Conv2D-268 [[1, 64, 56, 56]] [1, 64, 56, 56] 36,864 BatchNorm2D-268 [[1, 64, 56, 56]] [1, 64, 56, 56] 256 Conv2D-269 [[1, 64, 56, 56]] [1, 256, 56, 56] 16,384 BatchNorm2D-269 [[1, 256, 56, 56]] [1, 256, 56, 56] 1,024 BottleneckBlock-18 [[1, 256, 56, 56]] [1, 256, 56, 56] 0 Conv2D-270 [[1, 256, 56, 56]] [1, 64, 56, 56] 16,384 BatchNorm2D-270 [[1, 64, 56, 56]] [1, 64, 56, 56] 256 ReLU-21 [[1, 256, 56, 56]] [1, 256, 56, 56] 0 Conv2D-271 [[1, 64, 56, 56]] [1, 64, 56, 56] 36,864 BatchNorm2D-271 [[1, 64, 56, 56]] [1, 64, 56, 56] 256 Conv2D-272 [[1, 64, 56, 56]] [1, 256, 56, 56] 16,384 BatchNorm2D-272 [[1, 256, 56, 56]] [1, 256, 56, 56] 1,024 BottleneckBlock-19 [[1, 256, 56, 56]] [1, 256, 56, 56] 0 Conv2D-274 [[1, 256, 56, 56]] [1, 128, 56, 56] 32,768 BatchNorm2D-274 [[1, 128, 56, 56]] [1, 128, 56, 56] 512 ReLU-22 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-275 [[1, 128, 56, 56]] [1, 128, 
28, 28] 147,456 BatchNorm2D-275 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 Conv2D-276 [[1, 128, 28, 28]] [1, 512, 28, 28] 65,536 BatchNorm2D-276 [[1, 512, 28, 28]] [1, 512, 28, 28] 2,048 Conv2D-273 [[1, 256, 56, 56]] [1, 512, 28, 28] 131,072 BatchNorm2D-273 [[1, 512, 28, 28]] [1, 512, 28, 28] 2,048 BottleneckBlock-20 [[1, 256, 56, 56]] [1, 512, 28, 28] 0 Conv2D-277 [[1, 512, 28, 28]] [1, 128, 28, 28] 65,536 BatchNorm2D-277 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 ReLU-23 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-278 [[1, 128, 28, 28]] [1, 128, 28, 28] 147,456 BatchNorm2D-278 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 Conv2D-279 [[1, 128, 28, 28]] [1, 512, 28, 28] 65,536 BatchNorm2D-279 [[1, 512, 28, 28]] [1, 512, 28, 28] 2,048 BottleneckBlock-21 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-280 [[1, 512, 28, 28]] [1, 128, 28, 28] 65,536 BatchNorm2D-280 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 ReLU-24 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-281 [[1, 128, 28, 28]] [1, 128, 28, 28] 147,456 BatchNorm2D-281 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 Conv2D-282 [[1, 128, 28, 28]] [1, 512, 28, 28] 65,536 BatchNorm2D-282 [[1, 512, 28, 28]] [1, 512, 28, 28] 2,048 BottleneckBlock-22 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-283 [[1, 512, 28, 28]] [1, 128, 28, 28] 65,536 BatchNorm2D-283 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 ReLU-25 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-284 [[1, 128, 28, 28]] [1, 128, 28, 28] 147,456 BatchNorm2D-284 [[1, 128, 28, 28]] [1, 128, 28, 28] 512 Conv2D-285 [[1, 128, 28, 28]] [1, 512, 28, 28] 65,536 BatchNorm2D-285 [[1, 512, 28, 28]] [1, 512, 28, 28] 2,048 BottleneckBlock-23 [[1, 512, 28, 28]] [1, 512, 28, 28] 0 Conv2D-287 [[1, 512, 28, 28]] [1, 256, 28, 28] 131,072 BatchNorm2D-287 [[1, 256, 28, 28]] [1, 256, 28, 28] 1,024 ReLU-26 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-288 [[1, 256, 28, 28]] [1, 256, 14, 14] 589,824 BatchNorm2D-288 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 Conv2D-289 [[1, 256, 14, 14]] [1, 1024, 14, 14] 262,144 
BatchNorm2D-289 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 Conv2D-286 [[1, 512, 28, 28]] [1, 1024, 14, 14] 524,288 BatchNorm2D-286 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 BottleneckBlock-24 [[1, 512, 28, 28]] [1, 1024, 14, 14] 0 Conv2D-290 [[1, 1024, 14, 14]] [1, 256, 14, 14] 262,144 BatchNorm2D-290 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 ReLU-27 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-291 [[1, 256, 14, 14]] [1, 256, 14, 14] 589,824 BatchNorm2D-291 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 Conv2D-292 [[1, 256, 14, 14]] [1, 1024, 14, 14] 262,144 BatchNorm2D-292 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 BottleneckBlock-25 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-293 [[1, 1024, 14, 14]] [1, 256, 14, 14] 262,144 BatchNorm2D-293 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 ReLU-28 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-294 [[1, 256, 14, 14]] [1, 256, 14, 14] 589,824 BatchNorm2D-294 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 Conv2D-295 [[1, 256, 14, 14]] [1, 1024, 14, 14] 262,144 BatchNorm2D-295 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 BottleneckBlock-26 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-296 [[1, 1024, 14, 14]] [1, 256, 14, 14] 262,144 BatchNorm2D-296 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 ReLU-29 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-297 [[1, 256, 14, 14]] [1, 256, 14, 14] 589,824 BatchNorm2D-297 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 Conv2D-298 [[1, 256, 14, 14]] [1, 1024, 14, 14] 262,144 BatchNorm2D-298 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 BottleneckBlock-27 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-299 [[1, 1024, 14, 14]] [1, 256, 14, 14] 262,144 BatchNorm2D-299 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 ReLU-30 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-300 [[1, 256, 14, 14]] [1, 256, 14, 14] 589,824 BatchNorm2D-300 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 Conv2D-301 [[1, 256, 14, 14]] [1, 1024, 14, 14] 262,144 BatchNorm2D-301 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 BottleneckBlock-28 [[1, 
1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-302 [[1, 1024, 14, 14]] [1, 256, 14, 14] 262,144 BatchNorm2D-302 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 ReLU-31 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-303 [[1, 256, 14, 14]] [1, 256, 14, 14] 589,824 BatchNorm2D-303 [[1, 256, 14, 14]] [1, 256, 14, 14] 1,024 Conv2D-304 [[1, 256, 14, 14]] [1, 1024, 14, 14] 262,144 BatchNorm2D-304 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 4,096 BottleneckBlock-29 [[1, 1024, 14, 14]] [1, 1024, 14, 14] 0 Conv2D-306 [[1, 1024, 14, 14]] [1, 512, 14, 14] 524,288 BatchNorm2D-306 [[1, 512, 14, 14]] [1, 512, 14, 14] 2,048 ReLU-32 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 0 Conv2D-307 [[1, 512, 14, 14]] [1, 512, 7, 7] 2,359,296 BatchNorm2D-307 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,048 Conv2D-308 [[1, 512, 7, 7]] [1, 2048, 7, 7] 1,048,576 BatchNorm2D-308 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 8,192 Conv2D-305 [[1, 1024, 14, 14]] [1, 2048, 7, 7] 2,097,152 BatchNorm2D-305 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 8,192 BottleneckBlock-30 [[1, 1024, 14, 14]] [1, 2048, 7, 7] 0 Conv2D-309 [[1, 2048, 7, 7]] [1, 512, 7, 7] 1,048,576 BatchNorm2D-309 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,048 ReLU-33 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 0 Conv2D-310 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,359,296 BatchNorm2D-310 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,048 Conv2D-311 [[1, 512, 7, 7]] [1, 2048, 7, 7] 1,048,576 BatchNorm2D-311 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 8,192 BottleneckBlock-31 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 0 Conv2D-312 [[1, 2048, 7, 7]] [1, 512, 7, 7] 1,048,576 BatchNorm2D-312 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,048 ReLU-34 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 0 Conv2D-313 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,359,296 BatchNorm2D-313 [[1, 512, 7, 7]] [1, 512, 7, 7] 2,048 Conv2D-314 [[1, 512, 7, 7]] [1, 2048, 7, 7] 1,048,576 BatchNorm2D-314 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 8,192 BottleneckBlock-32 [[1, 2048, 7, 7]] [1, 2048, 7, 7] 0 AdaptiveAvgPool2D-6 [[1, 2048, 7, 7]] [1, 2048, 1, 1] 0 Linear-6 [[1, 2048]] [1, 7] 14,343 
=============================================================================== Total params: 23,575,495 Trainable params: 23,469,255 Non-trainable params: 106,240 ------------------------------------------------------------------------------- Input size (MB): 0.57 Forward/backward pass size (MB): 261.48 Params size (MB): 89.93 Estimated Total Size (MB): 351.98 -------------------------------------------------------------------------------
C:\Users\Administrator\anaconda3\lib\site-packages\paddle\nn\layer\norm.py:635: UserWarning: When training, we now always track global mean and variance. warnings.warn( 2021-03-06 19:41:01,512 - INFO: Epoch:1/2, batch:1, train_loss:[2.08230], acc_top1:[0.18750], acc_top5:[0.79688](1.27s) 2021-03-06 19:41:02,223 - INFO: Epoch:1/2, batch:2, train_loss:[1.78195], acc_top1:[0.40625], acc_top5:[0.84375](0.71s) 2021-03-06 19:41:03,030 - INFO: Epoch:1/2, batch:3, train_loss:[1.75363], acc_top1:[0.25000], acc_top5:[0.84375](0.81s) 2021-03-06 19:41:03,836 - INFO: Epoch:1/2, batch:4, train_loss:[1.86290], acc_top1:[0.28125], acc_top5:[0.84375](0.81s) 2021-03-06 19:41:04,613 - INFO: Epoch:1/2, batch:5, train_loss:[1.40329], acc_top1:[0.48438], acc_top5:[0.93750](0.78s) 2021-03-06 19:41:05,378 - INFO: Epoch:1/2, batch:6, train_loss:[1.26327], acc_top1:[0.56250], acc_top5:[0.90625](0.76s) 2021-03-06 19:41:05,765 - INFO: [validation] Epoch:1/2, val_loss:[0.02363], val_top1:[0.53731], val_top5:[0.94030] C:\Users\Administrator\anaconda3\lib\site-packages\paddle\fluid\layers\math_op_patch.py:293: UserWarning: C:\Users\Administrator\anaconda3\lib\site-packages\paddle\vision\models\resnet.py:145 The behavior of expression A + B has been unified with elementwise_add(X, Y, axis=-1) from Paddle 2.0. If your code works well in the older versions but crashes in this version, try to use elementwise_add(X, Y, axis=0) instead of A + B. This transitional warning will be dropped in the future. 
warnings.warn( 2021-03-06 19:41:09,185 - INFO: 已保存当前测试模型(epoch=1)为最优模型:Butterfly_Mobilenetv2_without_pretrained_final 2021-03-06 19:41:09,186 - INFO: 最优top1测试精度:0.53731 (epoch=1) 2021-03-06 19:41:10,063 - INFO: Epoch:2/2, batch:7, train_loss:[1.08590], acc_top1:[0.68750], acc_top5:[0.98438](4.69s) 2021-03-06 19:41:10,886 - INFO: Epoch:2/2, batch:8, train_loss:[0.86281], acc_top1:[0.73438], acc_top5:[0.96875](0.82s) 2021-03-06 19:41:11,659 - INFO: Epoch:2/2, batch:9, train_loss:[0.94885], acc_top1:[0.68750], acc_top5:[0.95312](0.77s) 2021-03-06 19:41:12,452 - INFO: Epoch:2/2, batch:10, train_loss:[0.55992], acc_top1:[0.85938], acc_top5:[1.00000](0.79s) 2021-03-06 19:41:13,242 - INFO: Epoch:2/2, batch:11, train_loss:[0.47196], acc_top1:[0.85938], acc_top5:[0.98438](0.79s) 2021-03-06 19:41:14,046 - INFO: Epoch:2/2, batch:12, train_loss:[0.55990], acc_top1:[0.79688], acc_top5:[0.95312](0.80s) 2021-03-06 19:41:14,417 - INFO: [validation] Epoch:2/2, val_loss:[0.00819], val_top1:[0.79104], val_top5:[0.98507] 2021-03-06 19:41:17,910 - INFO: 已保存当前测试模型(epoch=2)为最优模型:Butterfly_Mobilenetv2_without_pretrained_final 2021-03-06 19:41:17,911 - INFO: 最优top1测试精度:0.79104 (epoch=2) 2021-03-06 19:41:17,912 - INFO: 训练完成,最终性能accuracy=0.79104(epoch=2), 总耗时17.67s, 已将其保存为:Butterfly_Mobilenetv2_without_pretrained_final 2021-03-06 19:41:17,912 - INFO: 训练完毕,结果路径D:\Workspace\ExpResults\Butterfly_Mobilenetv2_without_pretrained_202103061920. 2021-03-06 19:41:17,913 - INFO: Done.
使用高层API接口进行训练代码简单,而且容易理解。但因为代码被封装,因此对于过程的分析不如标准框架清晰,需要借助于其他第三方接口,例如Paddle DL。
import os
import time
import json
from config import *
from utils.optimizer import learning_rate_setting, optimizer_setting
from utils.logger import logger
from utils.reader import dataset_train, dataset_trainval, dataset_val
import paddle
from paddle.static import InputSpec
def train(model):
    """Train with the high-level ``Model.fit`` API, then export the final
    checkpoint (resumable) and inference (deployable) models."""
    model.fit(train_data=dataset_train,
              eval_data=dataset_val,
              epochs=train_parameters['total_epoch'],
              batch_size=train_parameters['batch_size'],
              log_freq=train_parameters['log_interval'],
              eval_freq=train_parameters['eval_interval'],
              # save_dir=checkpoint_path,  # checkpoint model, used to resume training
              save_freq=train_parameters['total_epoch'],
              shuffle=True,
              verbose=1)  # 1 folds each epoch's log; 2 expands it
    # Finetune model: used for tuning and resuming training.
    model.save(os.path.join(checkpoint_path, model_name + '_final'))
    # Inference model: used for deployment and prediction.
    model.save(os.path.join(final_models_path, model_name + '_final'), training=False)
if __name__ == '__main__':
    # Persist this run's hyper-parameters to the log (pretty-printed JSON).
    data = json.dumps(train_parameters, indent=4, ensure_ascii=False, sort_keys=False, separators=(',', ':'))
    logger.info(data)
    # Announce the run configuration.
    logger.info('训练参数保存完毕,使用{}模型, 训练{}数据, 训练集{}, 启动训练...'.format(train_parameters['architecture'],train_parameters['dataset_name'],train_parameters['training_data']))
    logger.info('当前模型目录为:{}'.format(model_name + '_' + train_parameters['starting_time']))
    # Declare tensor specs for the image input and the integer label.
    input_spec = InputSpec(shape=[None] + train_parameters['input_size'], dtype='float32', name='image')
    label_spec = InputSpec(shape=[None, 1], dtype='int64', name='label')
    # Build the official backbone; weights are downloaded automatically when
    # missing. pretrained=True|False toggles the ImageNet initialization.
    network = paddle.vision.models.resnet50(num_classes=train_parameters['class_dim'], pretrained=train_parameters['pretrained'])
    model = paddle.Model(network, input_spec, label_spec)
    logger.info('模型参数信息:')
    # logger.info(model.summary())  # uncomment to dump the layer-by-layer summary
    if train_parameters['checkpoint_train'] == True:
        # Resume an interrupted run from its checkpoint.
        model.load(checkpoint_load_model)
        logger.info('载入{}中断模型和参数完毕,开始从checkpoint恢复训练'.format(train_parameters['architecture']))
        logger.info('checkpoint模型:{}'.format(checkpoint_load_model))
    else:
        if train_parameters['pretrained'] == False:
            logger.info('载入{}模型完毕,从初始状态开始训练'.format(train_parameters['architecture']))
        elif train_parameters['pretrained_model'] == 'API':
            logger.info('载入Imagenet-{}预训练模型完毕,开始微调训练(fine-tune)'.format(train_parameters['architecture']))
        else:
            # Fine-tune from a project-local pretrained model.
            model.load(project_pretrained_model)
            logger.info('载入自定义预训练{}模型完毕,开始微调训练(fine-tune)'.format(train_parameters['architecture']))
            logger.info('预训练模型:{}'.format(project_pretrained_model))
    # Learning rate, optimizer, loss and metrics.
    lr = learning_rate_setting()
    optimizer = optimizer_setting(model, lr)
    model.prepare(optimizer,
                  paddle.nn.CrossEntropyLoss(),
                  paddle.metric.Accuracy(topk=(1,5)))
    # Launch the training loop.
    start = time.perf_counter()
    train(model)
    logger.info('训练完毕,结果路径{}.'.format(result_root_path))
    print('Done.')
C:\Users\Administrator\anaconda3\lib\site-packages\ipykernel\ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above. and should_run_async(code) 2021-03-06 19:16:07,193 - INFO: { "dataset_name":"Butterfly", "architecture":"Mobilenetv2_without", "training_data":"train", "starting_time":"202103061915", "input_size":[ 3, 224, 224 ], "mean_value":[ 0.485, 0.456, 0.406 ], "std_value":[ 0.229, 0.224, 0.225 ], "num_trainval":490, "num_train":423, "num_val":67, "num_test":129, "class_dim":7, "label_dict":{ "0":"admiral", "1":"black_swallowtail", "2":"machaon", "3":"monarch_closed", "4":"monarch_open", "5":"peacock", "6":"zebra" }, "total_epoch":2, "batch_size":64, "log_interval":1, "eval_interval":1, "checkpointed":false, "checkpoint_train":false, "checkpoint_model":"Butterfly_Mobilenetv2_pretrained_final", "checkpoint_time":"202102182058", "pretrained":true, "pretrained_model":"API", "dataset_root_path":"D:\\Workspace\\ExpDatasets", "result_root_path":"D:\\Workspace\\ExpResults", "project_result_path":"D:\\Workspace\\MyProjects\\ButterflyClassification\\results", "useGPU":true, "learning_strategy":{ "optimizer_strategy":"Momentum", "learning_rate_strategy":"CosineAnnealingDecay", "learning_rate":0.01, "momentum":0.9, "Piecewise_boundaries":[ 60, 80, 90 ], "Piecewise_values":[ 0.01, 0.001, 0.0001, 1e-05 ], "Exponential_gamma":0.9, "Polynomial_decay_steps":10, "verbose":false }, "augmentation_strategy":{ "withAugmentation":true, "augmentation_prob":0.5, "rotate_angle":15, "Hflip_prob":0.5, "brightness":0.4, "contrast":0.4, "saturation":0.4, "hue":0.4 } } 2021-03-06 19:16:07,194 - INFO: 训练参数保存完毕,使用Mobilenetv2_without模型, 训练Butterfly数据, 训练集train, 启动训练... 
2021-03-06 19:16:07,195 - INFO: 当前模型目录为:Butterfly_Mobilenetv2_without_202103061915 2021-03-06 19:16:07,272 - INFO: unique_endpoints {''} 2021-03-06 19:16:07,273 - INFO: File C:\Users\Administrator/.cache/paddle/hapi/weights\resnet50.pdparams md5 checking... 2021-03-06 19:16:07,566 - INFO: Found C:\Users\Administrator/.cache/paddle/hapi/weights\resnet50.pdparams 2021-03-06 19:16:08,269 - INFO: 模型参数信息: 2021-03-06 19:16:08,269 - INFO: 载入Imagenet-Mobilenetv2_without预训练模型完毕,开始微调训练(fine-tune)
The loss value printed in the log is the current step, and the metric is the average value of previous step. Epoch 1/2 step 7/7 [==============================] - loss: 1.0073 - acc_top1: 0.5485 - acc_top5: 0.9007 - 632ms/step Eval begin... The loss value printed in the log is the current batch, and the metric is the average value of previous step. step 2/2 [==============================] - loss: 0.8268 - acc_top1: 0.8507 - acc_top5: 0.9254 - 198ms/step Eval samples: 67 Epoch 2/2 step 7/7 [==============================] - loss: 0.1625 - acc_top1: 0.9078 - acc_top5: 0.9858 - 564ms/step Eval begin... The loss value printed in the log is the current batch, and the metric is the average value of previous step. step 2/2 [==============================] - loss: 5.4584e-04 - acc_top1: 0.9104 - acc_top5: 1.0000 - 202ms/step Eval samples: 67
2021-03-06 19:16:21,244 - INFO: 训练完毕,结果路径D:\Workspace\ExpResults\Butterfly_Mobilenetv2_without_202103061915.
Done.
from utils.reader import dataset_val

# Evaluate the fine-tuned model on the validation split with Model.evaluate.
result = model.evaluate(dataset_val, batch_size=args['batch_size'], verbose=0)
print(f"验证集top1准确率:{result['acc_top1']:.5f}, top5准确率:{result['acc_top5']:.5f}")
验证集top1准确率:0.91045, top5准确率:1.00000
from utils.reader import dataset_test

# Evaluate the fine-tuned model on the *test* split with Model.evaluate.
result = model.evaluate(dataset_test, batch_size=args['batch_size'], verbose=0)
# Fixed: the original message said "验证集" (validation set) even though this
# cell evaluates dataset_test — the label now matches the data being scored.
print('测试集top1准确率:{:.5f}, top5准确率:{:.5f}'.format(result['acc_top1'], result['acc_top5']))
验证集top1准确率:0.90698, top5准确率:1.00000
使用paddle的高层API接口model.predict进行预测。该方法代码比较简单,但通常进行预测的时候都是使用部署模型进行评估,因此建议参考三、测试集评估
部分,并使用部署模型进行。
import numpy as np
import paddle.nn.functional as F

# Predict the whole test set via the high-level Model.predict API.
logits = model.predict(dataset_test, batch_size=args['batch_size'], stack_outputs=True)
# logits[0]: stacked raw network outputs -> softmax probabilities -> class ids.
pred = F.softmax(paddle.to_tensor(logits[0]))
pred_id = np.argmax(pred.numpy(), axis=1)
pred_id
Predict begin... step 3/3 [==============================] - 253ms/step Predict samples: 129
array([0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int64)
# 载入项目文件夹
import sys
sys.path.append(r"D:\\Workspace\\MyProjects\\ButterflyClassification")
#################################################################################
import numpy as np
import paddle
from config import train_parameters as args
from config import project_checkpoint_path
from utils.reader import val_reader, test_reader
import paddle.nn.functional as F
from paddle.static import InputSpec
__all__ = ['eval']
def eval(model, data_reader, verbose=0):
    """Evaluate `model` over every batch produced by `data_reader`.

    Args:
        model: a paddle.Model prepared with a loss and Accuracy(topk=(1, 5)).
        data_reader: iterable yielding (image, label) batches.
        verbose: 1 prints per-batch progress; 0 stays silent.

    Returns:
        Tuple (avg_loss, avg_acc_top1, avg_acc_top5), each divided by the
        total number of evaluated samples.
    """
    if verbose == 1:
        print('开始评估...损失和精度均在上一个batch进行评估')
    top1_weighted = []
    top5_weighted = []
    batch_losses = []
    sample_count = 0
    for batch_id, (image, label) in enumerate(data_reader):
        batch_size = len(label)
        sample_count += batch_size
        label = paddle.unsqueeze(label, axis=1)
        loss, acc = model.eval_batch([image], [label])
        batch_losses.append(loss[0])
        # Weight the per-batch accuracies by batch size before averaging.
        top1_weighted.append(acc[0][0] * batch_size)
        top5_weighted.append(acc[0][1] * batch_size)
        if verbose == 1:
            print('Batch:{}/{}, acc_top1:[{:.5f}], acc_top5:[{:.5f}]'.format(batch_id+1, len(data_reader), acc[0][0], acc[0][1]))
    # NOTE(review): the loss terms are per-batch values (not sample-weighted)
    # yet are divided by the total sample count — kept as-is to preserve the
    # original behaviour; the original comments describe the same convention.
    avg_loss = np.sum(batch_losses) / sample_count
    avg_acc_top1 = np.sum(top1_weighted) / sample_count
    avg_acc_top5 = np.sum(top5_weighted) / sample_count
    return avg_loss, avg_acc_top1, avg_acc_top5
##############################################################
if __name__ == '__main__':
    # Tensor specs for the image input and the integer label.
    input_spec = InputSpec(shape=[None] + args['input_size'], dtype='float32', name='image')
    label_spec = InputSpec(shape=[None, 1], dtype='int64', name='label')
    # Rebuild the network and load the fine-tuned checkpoint weights.
    network = paddle.vision.models.mobilenet_v2(num_classes=args['class_dim'])
    model = paddle.Model(network, input_spec, label_spec)
    model.load(project_checkpoint_path)
    model.prepare(loss=paddle.nn.CrossEntropyLoss(),
                  metrics=paddle.metric.Accuracy(topk=(1,5)))
    # Report loss and accuracy on the validation split, then the test split.
    avg_loss, avg_acc_top1, avg_acc_top5 = eval(model, val_reader(), verbose=0)
    print('[验证集] 损失: {:.5f}, top1精度:{:.5f}, top5精度为:{:.5f}'.format(avg_loss, avg_acc_top1, avg_acc_top5))
    avg_loss, avg_acc_top1, avg_acc_top5 = eval(model, test_reader())
    print('[测试集] 损失: {:.5f}, top1精度:{:.5f}, top5精度为:{:.5f}'.format(avg_loss, avg_acc_top1, avg_acc_top5))
[验证集] 损失: 0.00829, top1精度:0.88060, top5精度为:0.98507 [测试集] 损失: 0.03266, top1精度:0.84496, top5精度为:0.99225
基于Paddle高层进行计算的代码相对简单,但是存在一个问题。接口输出的loss是基于patch的,而非整个验证集,因此数据会存在一定的偏差。这种偏差是由于计算方法带来,因此在实际应用中,如果所有的样本和方法都是用相同的计算方法影响不大。但是对于不同的模型,可能因为各种原因设置不同的batch_size,这时,所获得的loss就不具备可比性了。 因此,在进行多模型的对比评价的时候,建议使用拆解的代码结构进行书写。
同时,建议尽量使用单样本loss进行多模型对比。
# 使用微调模型对验证集和测试集进行评估
import paddle
from config import train_parameters as args
from config import project_checkpoint_path
from paddle.static import InputSpec
from utils.reader import *
def eval(dataset):
    """Evaluate the module-level `model` on `dataset` via Model.evaluate and
    return (loss, top-1 accuracy, top-5 accuracy)."""
    metrics = model.evaluate(dataset, batch_size=args['batch_size'], verbose=0)
    loss_value = metrics['loss'][0]
    return loss_value, metrics['acc_top1'], metrics['acc_top5']
##############################################################
if __name__ == '__main__':
    # Tensor specs for the image input and the integer label.
    input_spec = InputSpec(shape=[None] + args['input_size'], dtype='float32', name='image')
    label_spec = InputSpec(shape=[None, 1], dtype='int64', name='label')
    # Rebuild the network and load the fine-tuned checkpoint weights.
    network = paddle.vision.models.mobilenet_v2(num_classes=args['class_dim'])
    model = paddle.Model(network, input_spec, label_spec)
    model.load(project_checkpoint_path)
    model.prepare(loss=paddle.nn.CrossEntropyLoss(),
                  metrics=paddle.metric.Accuracy(topk=(1,5)))
    # Report loss and accuracy on the validation split, then the test split.
    avg_loss, avg_acc_top1, avg_acc_top5 = eval(dataset_val)
    print('[验证集] 损失: {:.5f}, top1精度:{:.5f}, top5精度为:{:.5f}'.format(avg_loss, avg_acc_top1, avg_acc_top5))
    avg_loss, avg_acc_top1, avg_acc_top5 = eval(dataset_test)
    print('[测试集] 损失: {:.5f}, top1精度:{:.5f}, top5精度为:{:.5f}'.format(avg_loss, avg_acc_top1, avg_acc_top5))
C:\Users\Administrator\anaconda3\lib\site-packages\paddle\fluid\layers\utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3, and in 3.9 it will stop working return (isinstance(seq, collections.Sequence) and
[验证集] 损失: 0.00317, top1精度:0.88060, top5精度为:0.98507 [测试集] 损失: 3.00281, top1精度:0.84496, top5精度为:0.99225
部署模型不包含优化器参数因此容量较小,但对验证集的评估通常都在实验室中完成,同时为了简化代码,期望可以被训练的代码复用,因此一般会采用基于调优模型进行编写。因此,推荐使用 2.1节(推荐)使用调优模型对验证集进行评估
的代码,对验证集进行评估。
同时,基于部署模型的代码可以使用对测试集的代码进行复用。
import numpy as np
import paddle
from config import train_parameters as args
from config import project_final_model_path
from utils.reader import val_reader, test_reader
import paddle.nn.functional as F
def eval(model, data_reader):
    """Evaluate a deployed (``paddle.jit.load``-ed) model on `data_reader`.

    Computes softmax accuracy (top-1/top-5) and cross-entropy loss batch by
    batch; returns (avg_loss, avg_acc_top1, avg_acc_top5).
    """
    top1_weighted = []
    top5_weighted = []
    batch_losses = []
    sample_count = 0
    for batch_id, (image, label) in enumerate(data_reader):
        batch_size = len(label)
        sample_count += batch_size
        label = paddle.unsqueeze(label, axis=1)  # label as a rank-2 tensor
        logits = model(image)
        pred = F.softmax(logits)
        acc_top1 = paddle.metric.accuracy(pred, label, k=1)
        acc_top5 = paddle.metric.accuracy(pred, label, k=5)
        loss = F.cross_entropy(logits, label)
        avg_loss = paddle.mean(loss)  # scalar batch loss
        batch_losses.append(avg_loss.numpy())
        # Weight the per-batch accuracies by batch size before averaging.
        top1_weighted.append(acc_top1.numpy() * batch_size)
        top5_weighted.append(acc_top5.numpy() * batch_size)
    # NOTE(review): per-batch mean losses are summed and divided by the total
    # sample count (accuracies are sample-weighted, losses are not) — kept
    # as-is to preserve the original behaviour and match the sibling eval().
    avg_loss = np.sum(batch_losses) / sample_count
    avg_acc_top1 = np.sum(top1_weighted) / sample_count
    avg_acc_top5 = np.sum(top5_weighted) / sample_count
    return avg_loss, avg_acc_top1, avg_acc_top5
##############################################################
if __name__ == '__main__':
    # Load the exported inference model (no optimizer state, smaller file).
    model = paddle.jit.load(project_final_model_path)
    # Report loss and accuracy on the validation split, then the test split.
    avg_loss, avg_acc_top1, avg_acc_top5 = eval(model, val_reader())
    print('[验证集] 损失: {:.5f}, top1精度:{:.5f}, top5精度为:{:.5f}'.format(avg_loss, avg_acc_top1, avg_acc_top5))
    avg_loss, avg_acc_top1, avg_acc_top5 = eval(model, test_reader())
    print('[测试集] 损失: {:.5f}, top1精度:{:.5f}, top5精度为:{:.5f}'.format(avg_loss, avg_acc_top1, avg_acc_top5))
[验证集] 损失: 0.00829, top1精度:0.88060, top5精度为:0.98507 [测试集] 损失: 0.03266, top1精度:0.84496, top5精度为:0.99225