Hi, thank you for providing this wonderful code. I am trying to adopt the IQ-Learn method in my custom environment, but I am running into a diverging critic loss. I tried copying and pasting the original code from GitHub, yet the divergence keeps happening. Is this expected when the IQ imitation learning method is combined with SAC, or am I using it incorrectly? I have attached my code to this post, along with a plot of the loss.
```python
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal

# Actor, Critic, the expert DataLoader, and a few helpers
# (normalization, get_samples, label_preprocessing) are
# project-specific modules and are not shown here.


class IQ(nn.Module):
    def __init__(self, args):
        super(IQ, self).__init__()
        self.args = args
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.actor = Actor(self.args).to(self.device)
        self.q = Critic(self.args).to(self.device)
        # self.q_2 = Critic(self.args).to(self.device)
        self.target_q = Critic(self.args).to(self.device)
        # self.target_q_2 = Critic(self.args).to(self.device)
        self.soft_update(self.q, self.target_q, 1.)
        # self.soft_update(self.q_2, self.target_q_2, 1.)
        # self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))
        self.log_alpha = nn.Parameter(torch.log(torch.tensor(1e-3)))
        self.target_entropy = -torch.tensor(self.args.p_len * self.args.state_dim)
        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)
        # self.q_2_optimizer = optim.Adam(self.q_2.parameters(), lr=self.args.q_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=self.args.q_lr)
        # create the checkpoint directory if it does not exist
        if not os.path.exists(self.args.pretrain_model_dir):
            os.mkdir(self.args.pretrain_model_dir)

    @property
    def alpha(self):
        return self.log_alpha.exp()
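    # get_action samples u ~ N(mu, sigma) with the reparameterization trick,
    # squashes it with a = tanh(u), and applies the change-of-variables
    # correction log pi(a|s) = log N(u) - sum_i log(1 - tanh(u_i)^2 + eps).
    # Note eps = 1e-3 here; SAC implementations commonly use a smaller
    # value such as 1e-6, which changes the log-prob slightly.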
    def get_action(self, depth, imu, dir_vector):
        # normalize the raw observations
        depth, imu, dir_vector = self.normalization(depth, imu, dir_vector)
        mu, std = self.actor(depth, imu, dir_vector)
        std = std.exp()
        dist = Normal(mu, std)
        u = dist.rsample()
        u_log_prob = dist.log_prob(u)
        a = torch.tanh(u)
        a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
        return a, a_log_prob.sum(-1, keepdim=True)
    def q_update(self, current_Q, current_v, next_v, done_masks, is_expert):
        # 1st term of the IQ loss:
        # -E_{rho_expert}[ phi'(Q(s, a) - gamma * V(s')) * (Q(s, a) - gamma * V(s')) ]
        with torch.no_grad():
            y = (1 - done_masks) * self.args.gamma * next_v
        reward = (current_Q - y)[is_expert]
        # "our proposed unbiased form for fixing the KL divergence" (1st loss function)
        # note: phi_grad is not detached, so its own gradient w.r.t. Q also flows into the loss
        phi_grad = torch.exp(-reward)
        loss = -(phi_grad * reward).mean()

        # 2nd loss function, sampled from both expert and policy states (works online):
        # E_{rho}[ V(s) - gamma * V(s') ]
        value_loss = (current_v - y).mean()
        loss += value_loss

        # chi^2 divergence regularization term for the IQ loss,
        # also over expert and policy states (works online)
        reward = current_Q - y
        # the alpha value is fixed at 0.5 here
        # chi2_loss = 1 / (4 * self.alpha) * (reward ** 2).mean()
        chi2_loss = 1 / (4 * 0.5) * (reward ** 2).mean()
        loss += chi2_loss
        # the full objective is written out after the code block
        return loss
    def train_network(self, writer, n_epi, train_memory):
        print("SAC UPDATE")
        depth, imu, dir_vector, actions, rewards, next_depth, next_imu, next_dir_vector, done_masks, is_expert = \
            self.get_samples(train_memory)
        q1, q2 = self.q(depth, imu, dir_vector, actions)
        v1, v2 = self.getV(self.q, depth, imu, dir_vector)
        with torch.no_grad():
            next_v1, next_v2 = self.get_targetV(self.target_q, next_depth, next_imu, next_dir_vector)

        # critic (IQ) update
        q1_loss = self.q_update(q1, v1, next_v1, done_masks, is_expert)
        q2_loss = self.q_update(q2, v2, next_v2, done_masks, is_expert)
        critic_loss = 1 / 2 * (q1_loss + q2_loss)
        self.q_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q.parameters(), 1.0)
        self.q_optimizer.step()

        # actor update
        actor_loss, prob = self.actor_update(depth, imu, dir_vector)
        # alpha update (currently disabled)
        # alpha_loss = self.alpha_update(prob)

        self.soft_update(self.q, self.target_q, self.args.soft_update_rate)
        # self.soft_update(self.q_2, self.target_q_2, self.args.soft_update_rate)  # q_2 is commented out above

        if writer is not None:
            writer.add_scalar("loss/q_1", q1_loss, n_epi)
            writer.add_scalar("loss/q_2", q2_loss, n_epi)
            writer.add_scalar("loss/actor_loss", actor_loss, n_epi)
            # writer.add_scalar("loss/alpha", alpha_loss, n_epi)  # undefined while the alpha update is disabled

        # save the actor periodically
        if np.mod(n_epi, self.args.save_period) == 0 and n_epi > 0:
            torch.save(self.actor.state_dict(), self.args.pretrain_model_dir + 'actor.pt')
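    # Standard SAC policy objective, J_pi = E[ alpha * log pi(a|s) - min(Q1, Q2) ],
    # with alpha detached so only the actor parameters receive gradients here.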
    def actor_update(self, depth, imu, dir_vector):
        now_actions, now_action_log_prob = self.get_action(depth, imu, dir_vector)
        q_1, q_2 = self.q(depth, imu, dir_vector, now_actions)
        q = torch.min(q_1, q_2)
        loss = (self.alpha.detach() * now_action_log_prob - q).mean()
        self.actor_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1.0)
        self.actor_optimizer.step()
        return loss, now_action_log_prob
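    # SAC temperature objective, J(alpha) = E[ -alpha * (log pi(a|s) + target_entropy) ].
    # Currently not called from train_network (the call there is commented out).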
    def alpha_update(self, now_action_log_prob):
        loss = (-self.alpha * (now_action_log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()
        return loss
    def soft_update(self, network, target_network, rate):
        # Polyak averaging: target <- (1 - rate) * target + rate * network
        for network_params, target_network_params in zip(network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(target_network_params.data * (1.0 - rate) + network_params.data * rate)
    def get_expert_data(self):
        # define the expert demonstration dataset
        self.expert_dataloader = DataLoader(True, self.args)
        # sample a batch of expert transitions
        expert_depth, expert_imu, expert_dir_vector, expert_action, reward, done, expert_next_depth, expert_next_imu, expert_next_dir_vector \
            = self.expert_dataloader.__getitem__(batch_size=self.args.discrim_batch_size)
        # preprocess the action labels
        expert_action = self.label_preprocessing(expert_action)
        # convert numpy arrays into tensors on the training device
        expert_depth = torch.Tensor(expert_depth).to(self.device)
        expert_imu = torch.Tensor(expert_imu).to(self.device)
        expert_dir_vector = torch.Tensor(expert_dir_vector).to(self.device)
        expert_action = torch.Tensor(expert_action).to(self.device)
        expert_next_depth = torch.Tensor(expert_next_depth).to(self.device)
        expert_next_imu = torch.Tensor(expert_next_imu).to(self.device)
        expert_next_dir_vector = torch.Tensor(expert_next_dir_vector).to(self.device)
        return expert_depth, expert_imu, expert_dir_vector, expert_action, expert_next_depth, expert_next_imu, expert_next_dir_vector
    def getV(self, critic, depth, imu, dir_vector):
        # one-sample estimate of the soft value: V(s) = Q(s, a) - alpha * log pi(a|s), a ~ pi
        action, log_prob = self.get_action(depth, imu, dir_vector)
        current_Q1, current_Q2 = critic(depth, imu, dir_vector, action)
        current_V1 = current_Q1 - self.alpha.detach() * log_prob
        current_V2 = current_Q2 - self.alpha.detach() * log_prob
        return current_V1, current_V2

    def get_targetV(self, critic_target, depth, imu, dir_vector):
        # same soft value estimate, but from the target critic
        action, log_prob = self.get_action(depth, imu, dir_vector)
        target_Q1, target_Q2 = critic_target(depth, imu, dir_vector, action)
        target_V1 = target_Q1 - self.alpha.detach() * log_prob
        target_V2 = target_Q2 - self.alpha.detach() * log_prob
        return target_V1, target_V2
```