Commit 0d9082ac authored by daniel

removed optimizations not mentioned in paper

parent 01b999be
@@ -26,6 +26,6 @@ def setup_environment(name: str,
     if 'FIRE' in env.unwrapped.get_action_meanings():
         env = fire_reset.FireResetEnv(env)
     env = image_transformation.ImageTransformationEnv(env)
-    env = reward_clipping.ClipRewardEnv(env)
+    # env = reward_clipping.ClipRewardEnv(env)
     env = state_stack.StateStackEnv(env)
     return env
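Note: the ClipRewardEnv wrapper being disabled here lives in the reward_clipping module and is not shown in this diff. As a rough sketch, the usual Atari reward-clipping wrapper (assuming the classic gym.RewardWrapper API) simply clips every reward to its sign:

import gym
import numpy as np


class ClipRewardEnv(gym.RewardWrapper):
    # Sketch only; the project's actual reward_clipping.ClipRewardEnv may differ.
    def reward(self, reward):
        # Clip each reward to {-1.0, 0.0, +1.0}.
        return float(np.sign(reward))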
@@ -66,7 +66,7 @@ class AtariParameters(nn.Module):
         self.action_output = nn.Linear(linear_output, output_size)
         self.device = device
-        self.init_weights()
+        # self.init_weights()
         self.to(device)
         self.initial_learn_rate = learn_rate
         self.optimizer = torch.optim.Adam(self.parameters(), lr=learn_rate, eps=adam_epsilon)
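Note: init_weights itself is not part of this diff. Many PPO Atari implementations use orthogonal weight initialization with zero biases, so a hypothetical sketch of what the disabled call might do:

import torch.nn as nn


def init_weights(module: nn.Module, gain: float = 2 ** 0.5):
    # Hypothetical scheme (assumed, not taken from this repository):
    # orthogonal weights and zero biases for convolutional and linear layers.
    if isinstance(module, (nn.Conv2d, nn.Linear)):
        nn.init.orthogonal_(module.weight, gain=gain)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0.0)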
@@ -113,8 +113,8 @@ class AtariParameters(nn.Module):
         self.optimizer.zero_grad()
         loss.backward()
         self._log_gradient_norm()
-        if self.max_gradient_norm is not None:
-            torch.nn.utils.clip_grad_norm_(self.parameters(), self.max_gradient_norm)
+        # if self.max_gradient_norm is not None:
+        #     torch.nn.utils.clip_grad_norm_(self.parameters(), self.max_gradient_norm)
         self.optimizer.step()
 
     def _log_gradient_norm(self):
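For context on the optimization removed in this hunk: torch.nn.utils.clip_grad_norm_ rescales all gradients in place so their global L2 norm does not exceed the given maximum. A minimal standalone sketch (max_norm=0.5 is an assumed value; the commit's actual max_gradient_norm is not shown):

import torch

weight = torch.nn.Parameter(torch.randn(4, 4))
loss = (weight ** 2).sum()
loss.backward()
# Rescales weight.grad in place if the total gradient norm exceeds 0.5.
torch.nn.utils.clip_grad_norm_([weight], max_norm=0.5)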
@@ -71,7 +71,8 @@ class ProximalPolicyOptimization:
                                                 rollout.values,
                                                 rollout.masks)
         rollout.returns = self._calculate_returns(advantages, rollout.values[:-1])
-        rollout.advantages = normalize_tensor(advantages)
+        # rollout.advantages = normalize_tensor(advantages)
+        rollout.advantages = advantages
         mini_batch_size = int(rollout.time_step / self.num_mini_batches) * self.num_envs
         sampler = samplers.BatchSampler(sampler=samplers.SubsetRandomSampler(range(rollout.time_step * self.num_envs)),
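Note: normalize_tensor is defined elsewhere in the project and not shown here. As an assumed sketch, advantage normalization in PPO implementations is typically a zero-mean, unit-variance rescaling:

import torch


def normalize_tensor(tensor: torch.Tensor, epsilon: float = 1e-8) -> torch.Tensor:
    # Assumed implementation: standardize the advantages; epsilon guards
    # against division by zero when all advantages are equal.
    return (tensor - tensor.mean()) / (tensor.std() + epsilon)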
@@ -175,9 +176,10 @@ class ProximalPolicyOptimization:
             raise ShapeError('Shapes of rewards ({}), new_values ({}) and old_values ({}) must be identical.'
                              .format(returns.shape, new_values.shape, old_values.shape))
         losses = (new_values - returns) ** 2
-        clipped_values = old_values + (new_values - old_values).clamp(min=-self.clip_range, max=self.clip_range)
-        clipped_losses = (clipped_values - returns) ** 2
-        squared_error = 0.5 * torch.mean(torch.max(losses, clipped_losses))
+        squared_error = 0.5 * torch.mean(losses)
+        # clipped_values = old_values + (new_values - old_values).clamp(min=-self.clip_range, max=self.clip_range)
+        # clipped_losses = (clipped_values - returns) ** 2
+        # squared_error = 0.5 * torch.mean(torch.max(losses, clipped_losses))
         return squared_error
 
     def _calculate_advantages(self,
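The optimization removed in this last hunk is PPO's clipped value loss: the new value predictions are kept within clip_range of the old ones, and the larger (more pessimistic) of the clipped and unclipped squared errors is averaged. A standalone restatement of the dropped formulation (the function name and the clip_range default of 0.1 are illustrative, not the commit's actual settings):

import torch


def clipped_value_loss(new_values, old_values, returns, clip_range=0.1):
    losses = (new_values - returns) ** 2
    # Keep value updates within +/- clip_range of the old predictions.
    clipped_values = old_values + (new_values - old_values).clamp(-clip_range, clip_range)
    clipped_losses = (clipped_values - returns) ** 2
    # Element-wise maximum, i.e. the more pessimistic error, then halve the mean.
    return 0.5 * torch.mean(torch.max(losses, clipped_losses))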