我正在嘗試通過torch保存和加載最佳模型,我在其中定義了我的訓練函式,如下所示:
def train_model(model, train_loader, test_loader, device, learning_rate=1e-1, num_epochs=200):
# The training configurations were not carefully selected.
criterion = nn.CrossEntropyLoss()
model.to(device)
# It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=500)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[65, 75], gamma=0.75, last_epoch=-1)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
# Evaluation
model.eval()
eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
print("Epoch: {:02d} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(-1, eval_loss, eval_accuracy))
load_model = input('Load a model?')
for epoch in range(num_epochs):
if epoch//2 == 0:
write_checkpoint(model=model, epoch=epoch, scheduler=scheduler, optimizer=optimizer)
model, optimizer, epoch, scheduler = load_checkpoint(model=model, scheduler=scheduler, optimizer=optimizer)
for state in optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.to(device)
# Training
model.train()
running_loss = 0
running_corrects = 0
for inputs, labels in train_loader:
inputs = torch.FloatTensor(inputs)
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward backward optimize
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# statistics
running_loss = loss.item() * inputs.size(0)
running_corrects = torch.sum(preds == labels.data)
train_loss = running_loss / len(train_loader.dataset)
train_accuracy = running_corrects / len(train_loader.dataset)
# Evaluation
model.eval()
eval_loss, eval_accuracy = evaluate_model(model=model, test_loader=test_loader, device=device, criterion=criterion)
# Set learning rate scheduler
scheduler.step()
print("Epoch: {:03d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format(epoch, train_loss, train_accuracy, eval_loss, eval_accuracy))
return model
我希望能夠加載模型,并從保存模型的時代開始訓練。
到目前為止,我有方法可以通過以下方式保存模型、優化器、調度程式狀態和時代
def write_checkpoint(model, optimizer, epoch, scheduler):
state = {'epoch': epoch 1, 'state_dict': model.state_dict(),
'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }
filename = '/content/model_'
torch.save(state, filename f'CP_epoch{epoch 1}.pth')
def load_checkpoint(model, optimizer, scheduler, filename='/content/checkpoint.pth'):
# Note: Input model & optimizer should be pre-defined. This routine only updates their states.
start_epoch = 0
if os.path.isfile(filename):
print("=> loading checkpoint '{}'".format(filename))
checkpoint = torch.load(filename)
start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler = checkpoint['scheduler']
print("=> loaded checkpoint '{}' (epoch {})"
.format(filename, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(filename))
return model, optimizer, start_epoch, scheduler
但我似乎無法想出如何更新時代以從正確的時代開始的邏輯。尋找有關如何實作這一點的提示或想法。
uj5u.com熱心網友回復:
如果我理解正確,您嘗試使用正確的紀元數從上次進度恢復訓練。
在呼叫train_model加載檢查點值之前,包括start_epoch. 然后start_epoch用作回圈起點,
for epoch in range(start_epoch, num_epochs):
轉載請註明出處,本文鏈接:https://www.uj5u.com/caozuo/520800.html
標籤:机器学习火炬
下一篇:TypeError:只有整數、切片(`:`)、省略號(`...`)、tf.newaxis(`None`)和標量tf.int32/tf.int64張量是有效的索引
