Hello! Thank you for your work, it's a great contribution.
I am experiencing unexpected behaviour which, according to my experiments, seems to be caused by setting the learning rate to 0.1. I am training on a subset of 200 examples from the CLWD dataset.
I have a training loop copied from your train.py, and I set the lr to 0.1 in the args. If I run the training, I see the L1 loss going down, but when I load the saved model and run inference on an example from the training set, I get an output that exactly matches the input. How can that be? It's as if the model weren't being applied.
# ---------------------------------------------------------------------------
# Experiment configuration: paths, dataset choice, and the full option set
# expected by train.py, collected into a single namespace object.
# ---------------------------------------------------------------------------
CROP_SIZE = 384
DATASET = "CLWD"
DATASET_DIR = "media/mini-CLWD-train-equal-test"
# DATASET_DIR = "media/10CLWD-train-equal-test"
TEST_DIR = DATASET_DIR
CHECKPOINT_DIR = "checkpoint"
RESUME_DIR = ""

ARGS = dict(
    # paths / dataset
    checkpoint=CHECKPOINT_DIR,
    dataset='clwd',
    dataset_dir=DATASET_DIR,
    test_dir=TEST_DIR,
    resume=RESUME_DIR,
    finetune='',
    data='',
    # schedule / optimization
    epochs=21,
    start_epoch=0,
    lr=0.1,
    dlr=0.001,
    schedule=[100, 200, 300],
    gamma=0.1,
    sigma_decay=0,
    momentum=0,
    weight_decay=0,
    beta1=0.9, beta2=0.999,
    # batching / input
    crop_size=CROP_SIZE,
    input_size=256,
    train_batch=6,
    test_batch=1,
    workers=1,
    preprocess='resize',
    data_augumentation=False,
    flip=False, no_flip=True,
    normalized_input=False,
    # model / losses
    nets='slbr',
    name='slbr_v1',
    bg_mode='res_mask',
    mask_mode='res',
    masked=False,
    k_center=2, k_refine=3, k_skip_stage=3,
    use_refine=True,
    project_mode='simple',
    loss_type='l2',
    lambda_content=0, lambda_iou=0, lambda_l1=4, lambda_mask=1,
    lambda_primary=0.01, lambda_style=0,
    alpha=0.5,
    sim_metric='cos',
    sltype='vggx',
    gan_norm=False,
    hl=False,
    res=False,
    requires_grad=False,
    # runtime
    gpu=True,
    gpu_id='0',
    debug=False,
    evaluate=False,
    freq=-1,
)


class DictArgs(object):
    """Lightweight namespace: exposes every dict entry as an attribute."""

    def __init__(self, d):
        self.__dict__.update(d)


ARGS = DictArgs(ARGS)
from __future__ import print_function, absolute_import
import argparse
import torch,time,os
torch.backends.cudnn.benchmark = True
from src.utils.misc import save_checkpoint, adjust_learning_rate
import src.models as models
import datasets as datasets
from options import Options
import numpy as np
def train(args):
    """Train the SLBR watermark-removal model configured by ``args``.

    Args:
        args: namespace-like config object (lr, epochs, dataset, workers, ...).

    Returns:
        The trained model wrapper (project-specific type).
    """
    # Fix RNG seeds so runs are reproducible.
    args.seed = 1
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    MODEL_NAME = "SLBR"

    # Resolve the dataset class from its (case-insensitive) name.
    args.dataset = args.dataset.lower()
    if args.dataset == 'clwd':
        dataset_func = datasets.CLWDDataset
    elif args.dataset == 'lvw':
        dataset_func = datasets.LVWDataset
    else:
        raise ValueError("Not known dataset:\t{}".format(args.dataset))

    train_loader = torch.utils.data.DataLoader(
        dataset_func('train', args), batch_size=args.train_batch,
        shuffle=True, num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        dataset_func('val', args), batch_size=args.test_batch,
        shuffle=False, num_workers=args.workers, pin_memory=True)

    lr = args.lr
    data_loaders = (train_loader, val_loader)
    # BUG FIX: previously the module-level global ARGS was passed here, which
    # silently ignored this function's own ``args`` parameter.
    model = models.__dict__[MODEL_NAME](datasets=data_loaders, args=args)

    print('============================ Initization Finish && Training Start =============================================')
    print(f"It will start in epoch: {model.args.start_epoch}\nIt will end in epoch: {model.args.epochs}")

    # Epochs at which we validate and write a checkpoint (hoisted out of the loop).
    save_epochs = {1, 10, 20, 50, 100, 200, 300, 400}
    for epoch in range(model.args.start_epoch, model.args.epochs):
        lr = adjust_learning_rate(data_loaders, model, epoch, lr, args)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))
        model.record('lr', lr, epoch)
        model.train(epoch)
        if epoch in save_epochs:
            model.validate(epoch)
            model.flush()  # memory/log housekeeping between validations
            print(f"Saving checkpoint of epoch {epoch}")
            model.save_checkpoint(filename=f"checkpoint{epoch}.pth.tar")
    return model
# Run training with the module-level config; keep the trained model for later inspection.
final_model = train(ARGS)
# helper functions
### setup
import os
import time
from pathlib import Path
import torch
import torch.nn.functional as F
import cv2
import numpy as np
import datasets as datasets
import src.models as models
torch.backends.cudnn.benchmark = True
# TEST_DIR = "test_imgs/"
# MODEL = "SLBR"
# CROP_SIZE = 384
# ONLY_PRED = True
#helper functions
def tensor2np(x, isMask=False):
    """Convert an NCHW tensor batch to an NHWC uint8 numpy array in [0, 255].

    Masks with a single channel are broadcast to 3 channels so they can be
    stacked next to RGB images.
    """
    out = x.detach().cpu()
    if isMask:
        if out.shape[1] == 1:
            out = out.repeat(1, 3, 1, 1)
        out = out * 255
    else:
        # De-normalization with mean=0 / std=1 is the identity, so only the
        # scale to the 8-bit range remains.
        out = out * 255
    return out.numpy().transpose(0, 2, 3, 1).astype(np.uint8)
def save_output(inputs, preds, save_dir, img_fn, extra_infos=None, verbose=False, alpha=0.5, only_pred=True):
    """Render a prediction either to screen (``verbose``) or to disk.

    Args:
        inputs: dict with key 'I' — the watermarked input batch (NCHW tensor).
        preds: dict with keys 'bg' (background prediction) and 'mask'.
        save_dir: directory where the output image is written.
        img_fn: source filename; its basename is reused for the output file.
        extra_infos: unused; kept for interface compatibility.
        verbose: if True, show the image in a window instead of saving it.
        alpha: unused; kept for interface compatibility.
        only_pred: if True, emit only the predicted background; otherwise a
            side-by-side panel of input | background | mask.
    """
    image = cv2.cvtColor(tensor2np(inputs['I'])[0], cv2.COLOR_RGB2BGR)
    bg_pred = cv2.cvtColor(tensor2np(preds['bg'])[0], cv2.COLOR_RGB2BGR)
    mask_pred = tensor2np(preds['mask'], isMask=True)[0]

    if only_pred:
        outimg = bg_pred
    else:
        outimg = np.concatenate([image, bg_pred, mask_pred], axis=1)

    if verbose:
        cv2.imshow("out", outimg)
        cv2.waitKey(0)
    else:
        # The original splitext[0] + splitext[1] round-trip just rebuilt the
        # basename — use it directly.
        out_fn = os.path.join(save_dir, os.path.split(img_fn)[-1])
        cv2.imwrite(out_fn, outimg)
def preprocess(file_path, img_size=512):
    """Load an image and return (tensor [1,C,H,W] in [0,1], original (h, w)).

    Args:
        file_path: path to the image file.
        img_size: side length the image is resized to (square).

    Raises:
        FileNotFoundError: if the file cannot be read as an image.
    """
    img_J = cv2.imread(file_path)
    if img_J is None:
        # cv2.imread returns None instead of raising; fail loudly rather than
        # relying on `assert`, which is stripped under `python -O`.
        raise FileNotFoundError(f"Could not read image: {file_path}")
    h, w, _ = img_J.shape
    img_J = cv2.cvtColor(img_J, cv2.COLOR_BGR2RGB).astype(float) / 255.
    img_J = torch.from_numpy(img_J.transpose(2, 0, 1)[np.newaxis, ...])  # [1,C,H,W]
    img_J = F.interpolate(img_J, size=(img_size, img_size), mode='bilinear')
    return img_J, (h, w)
def test_dataloader(folder, crop_size):
    """Collect every jpg/jpeg/png under ``folder`` and preprocess each image.

    Args:
        folder: directory to scan (non-recursive).
        crop_size: square size each image is resized to.

    Returns:
        (tensors, filenames, original (h, w) sizes), index-aligned lists.
    """
    patterns = ("*.jpg", "*.jpeg", "*.png")
    # Path.glob yields files in filesystem order, which is not deterministic
    # across runs/machines; sort so a given index always maps to the same file.
    files = sorted(p.as_posix() for pat in patterns for p in Path(folder).glob(pat))
    loaders, save_fns, orig_dims = [], [], []
    for fn in files:
        tensor, orig_size = preprocess(fn, img_size=crop_size)
        loaders.append(tensor)
        save_fns.append(fn)
        orig_dims.append(orig_size)
    return loaders, save_fns, orig_dims
import matplotlib.pyplot as plt
def sanity_check(checkpoint_path: str, image_index: int = 0):
    """Load a checkpoint and compare the model's output against its input.

    Runs inference on one training image, prints whether the composited
    output equals the input (both as tensors and as resized numpy arrays),
    and plots the two images side by side.

    Args:
        checkpoint_path: path to the .pth.tar checkpoint to resume from.
        image_index: index of the image within the (sorted) test folder.
    """
    MODEL = "SLBR"
    CROP_SIZE = 384
    DATASET = "CLWD"
    DATASET_DIR = "media/mini-CLWD-train-equal-test"
    TEST_DIR = DATASET_DIR
    CHECKPOINT_DIR = "checkpoint"
    RESUME_DIR = checkpoint_path
    ARGS = dict(checkpoint=CHECKPOINT_DIR,
                crop_size=CROP_SIZE,
                dataset='clwd',
                dataset_dir=DATASET_DIR, debug=False,
                dlr=0.001,
                epochs=100,
                evaluate=True,
                finetune='', flip=False,
                freq=-1,
                lr=0.01,
                resume=RESUME_DIR,
                start_epoch=0,
                nets='slbr',
                test_dir=TEST_DIR,
                train_batch=6,
                data='', data_augumentation=False,
                alpha=0.5, beta1=0.9, beta2=0.999, bg_mode='res_mask',
                gamma=0.1, gan_norm=False, gpu=True, gpu_id='0', hl=False,
                input_size=256, k_center=2, k_refine=3, k_skip_stage=3,
                lambda_content=0, lambda_iou=0, lambda_l1=4, lambda_mask=1,
                lambda_primary=0.01, lambda_style=0, loss_type='l2',
                mask_mode='res', masked=False, models='slbr', momentum=0,
                name='slbr_v1', no_flip=True, normalized_input=False,
                preprocess='resize', project_mode='simple', requires_grad=False,
                res=False,
                schedule=[5, 10], sigma_decay=0, sim_metric='cos', sltype='vggx', test_batch=1,
                use_refine=True, weight_decay=0, workers=2)

    class DictArgs(object):
        """Expose each dict entry as an attribute."""
        def __init__(self, d):
            for k, v in d.items():
                setattr(self, k, v)

    ARGS = DictArgs(ARGS)
    # `resume` is set, so model construction loads the checkpoint weights.
    model = models.__dict__[MODEL](datasets=(None, None), args=ARGS)

    ### inference and comparison
    TEST_DIR = "media/mini-CLWD-train-equal-test/train/Watermarked_image"
    doc_loader, fns, orig_dims = test_dataloader(TEST_DIR, ARGS.crop_size)
    input_example = doc_loader[image_index].to(model.device).float()
    # Inference only — no gradient tracking needed.
    with torch.no_grad():
        imoutput, immask_all, imwatermark = model.model(input_example)
    imoutput = imoutput[0]
    immask = immask_all[0]
    # Composite: predicted background inside the mask, original image outside.
    imfinal = imoutput * immask + model.norm(input_example) * (1 - immask)
    print(f"Boolean comparison of tensors (torch.equal): {torch.equal(input_example, imfinal)}")
    # BUG FIX: previously this used the hard-coded index 10 (orig_dims[10]),
    # resizing to the dims of an unrelated image instead of the selected one.
    target_size = orig_dims[image_index]
    input_example_p = tensor2np(F.interpolate(input_example, size=target_size, mode="bilinear"))[0]
    imfinal_p = tensor2np(F.interpolate(imfinal, size=target_size, mode="bilinear"))[0]
    print(f"Boolean comparison of processed numpy objects showed (np.array_equal): {np.array_equal(imfinal_p,input_example_p)}")
    fig, axes = plt.subplots(1, 2)
    axes[0].imshow(input_example_p)
    axes[1].imshow(imfinal_p)
# Compare the epoch-20 checkpoint's prediction against training image 0.
sanity_check("checkpoint/slbr_v1/checkpoint20.pth.tar", image_index=0)
Notice that I pass the checkpoint of the trained model via the resume variable. I run inference on a given example from the train set, check whether the tensors are equal, and also inspect the images visually.
The images look identical, and the tensors are equal entry by entry. I've tried other saved checkpoints as well. What could be going on?
Thanks for your support.